Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r--  Tools/unicode/makeunicodedata.py  135
1 file changed, 77 insertions(+), 58 deletions(-)
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index b2615eefcd..0783f1735d 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -25,17 +25,17 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
-import sys
+import sys, os, zipfile
 
 SCRIPT = sys.argv[0]
 VERSION = "3.2"
 
 # The Unicode Database
-UNIDATA_VERSION = "5.2.0"
+UNIDATA_VERSION = "6.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-UNIHAN = "Unihan%s.txt"
+UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
@@ -75,23 +75,13 @@ def maketables(trace=0):
 
     print("--- Reading", UNICODE_DATA % "", "...")
 
     version = ""
-    unicode = UnicodeData(UNICODE_DATA % version,
-                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version,
-                          UNIHAN % version,
-                          DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version,
-                          LINE_BREAK % version)
+    unicode = UnicodeData(UNIDATA_VERSION)
 
     print(len(list(filter(None, unicode.table))), "characters")
 
     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
-                                  COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version),
-                                  UNIHAN % ("-"+version),
-                                  DERIVED_CORE_PROPERTIES % ("-"+version))
+        old_unicode = UnicodeData(version)
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
@@ -771,6 +761,10 @@ def merge_old_version(version, new, old):
                         elif k == 16:
                             # derived property changes; not yet
                             pass
+                        elif k == 17:
+                            # normalization quickchecks are not performed
+                            # for older versions
+                            pass
                         else:
                             class Difference(Exception):pass
                             raise Difference(hex(i), k, old.table[i], new.table[i])
@@ -779,6 +773,21 @@ def merge_old_version(version, new, old):
                                           numeric_changes)),
                         normalization_changes))
 
+def open_data(template, version):
+    local = template % ('-'+version,)
+    if not os.path.exists(local):
+        import urllib.request
+        if version == '3.2.0':
+            # irregular url structure
+            url = 'http://www.unicode.org/Public/3.2-Update/' + local
+        else:
+            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+        urllib.request.urlretrieve(url, filename=local)
+    if local.endswith('.txt'):
+        return open(local, encoding='utf-8')
+    else:
+        # Unihan.zip
+        return open(local, 'rb')
 
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
@@ -793,11 +802,11 @@ class UnicodeData:
     # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
     # derived-props] (17)
 
-    def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+    def __init__(self, version,
+                 linebreakprops=False,
                  expand=1):
         self.changed = []
-        file = open(filename)
+        file = open_data(UNICODE_DATA, version)
         table = [None] * 0x110000
         while 1:
             s = file.readline()
@@ -825,11 +834,11 @@ class UnicodeData:
                 table[i] = f2
 
         # public attributes
-        self.filename = filename
+        self.filename = UNICODE_DATA % ''
         self.table = table
         self.chars = list(range(0x110000)) # unicode 3.2
 
-        file = open(exclusions)
+        file = open_data(COMPOSITION_EXCLUSIONS, version)
         self.exclusions = {}
         for s in file:
             s = s.strip()
@@ -841,7 +850,7 @@ class UnicodeData:
             self.exclusions[char] = 1
 
         widths = [None] * 0x110000
-        for s in open(eastasianwidth):
+        for s in open_data(EASTASIAN_WIDTH, version):
             s = s.strip()
             if not s:
                 continue
@@ -862,7 +871,7 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(set())
-        for s in open(derivedprops):
+        for s in open_data(DERIVED_CORE_PROPERTIES, version):
            s = s.split('#', 1)[0].strip()
             if not s:
                 continue
@@ -881,43 +890,53 @@ class UnicodeData:
                 # apply to unassigned code points; ignore them
                 table[char][-1].add(p)
 
-        if linebreakprops:
-            for s in open(linebreakprops):
-                s = s.partition('#')[0]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
-                    continue
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    table[char][-1].add('Line_Break')
+        for s in open_data(LINE_BREAK, version):
+            s = s.partition('#')[0]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                continue
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                table[char][-1].add('Line_Break')
 
-        if derivednormalizationprops:
-            quickchecks = [0] * 0x110000 # default is Yes
-            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-            for s in open(derivednormalizationprops):
-                if '#' in s:
-                    s = s[:s.index('#')]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in qc_order:
-                    continue
-                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-                quickcheck_shift = qc_order.index(s[1])*2
-                quickcheck <<= quickcheck_shift
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    assert not (quickchecks[char]>>quickcheck_shift)&3
-                    quickchecks[char] |= quickcheck
-            for i in range(0, 0x110000):
-                if table[i] is not None:
-                    table[i].append(quickchecks[i])
+        # We only want the quickcheck properties
+        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
+        # Yes is the default, hence only N and M occur
+        # In 3.2.0, the format was different (NF?_NO)
+        # The parsing will incorrectly determine these as
+        # "yes", however, unicodedata.c will not perform quickchecks
+        # for older versions, and no delta records will be created.
+        quickchecks = [0] * 0x110000
+        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
+            if '#' in s:
+                s = s[:s.index('#')]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in qc_order:
+                continue
+            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+            quickcheck_shift = qc_order.index(s[1])*2
+            quickcheck <<= quickcheck_shift
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                assert not (quickchecks[char]>>quickcheck_shift)&3
+                quickchecks[char] |= quickcheck
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(quickchecks[i])
 
-        for line in open(unihan, encoding='utf-8'):
+        zip = zipfile.ZipFile(open_data(UNIHAN, version))
+        if version == '3.2.0':
+            data = zip.open('Unihan-3.2.0.txt').read()
+        else:
+            data = zip.open('Unihan_NumericValues.txt').read()
+        for line in data.decode("utf-8").splitlines():
             if not line.startswith('U+'):
                 continue
             code, tag, value = line.split(None, 3)[:3]
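Note: the following is a minimal usage sketch of the new open_data() helper, not part of the patch, assuming it runs from Tools/unicode with network access. For UNICODE_DATA and UNIDATA_VERSION "6.0.0" the helper looks for a local UnicodeData-6.0.0.txt and otherwise fetches http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt; the local name keeps the version suffix while the remote name does not.

    # Hypothetical usage of open_data(); the templates come from the
    # patch, the printed values are the first UnicodeData.txt record.
    fh = open_data(UNICODE_DATA, UNIDATA_VERSION)
    fields = fh.readline().split(';')
    print(fields[0], fields[1])   # expected: 0000 <control>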
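Note: the quickcheck loop packs four 2-bit fields per code point, in the order NFD_QC, NFKD_QC, NFC_QC, NFKC_QC, with 0 = Yes (the default), 1 = Maybe, 2 = No. A small decoding sketch, not part of the patch; the helper name is illustrative:

    QC_ORDER = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()

    def decode_quickchecks(packed):
        # Unpack one packed table entry into {property: 'Y'/'M'/'N'}.
        return {prop: 'YMN'[(packed >> (i * 2)) & 3]
                for i, prop in enumerate(QC_ORDER)}

    # NFC_QC = Maybe: field value 1 shifted into the third 2-bit slot.
    assert decode_quickchecks(1 << 4)['NFC_QC'] == 'M'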
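Note: the Unihan loop at the end expects tab-separated "U+XXXX<TAB>tag<TAB>value" records, as found in Unihan_NumericValues.txt. A hedged illustration; the specific code point and tag are examples, not taken from the patch:

    line = 'U+4E94\tkPrimaryNumeric\t5'         # U+4E94 is the numeral five
    code, tag, value = line.split(None, 3)[:3]
    assert (code, tag, value) == ('U+4E94', 'kPrimaryNumeric', '5')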
