#
# makeunidb.py -- generate a compact version of the unicode property
# database (unicodedatabase.h)
#
# The generated C source is written to OUTPUT_FILE (see maketable).
#

import string  # no longer used below; kept so existing importers don't break
import sys

SCRIPT = sys.argv[0]
VERSION = "1.0"

UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

# Default name of the generated C header.
OUTPUT_FILE = "unicodedata_db.h"

# The position of a name in these lists is the number stored in the
# generated records, and the lists are emitted in this same order, so the
# order must not change.  "Cn" appears twice (slots 0 and 17); list.index()
# returns the first match, so records always use slot 0.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]


def maketable(unicode_data=UNICODE_DATA, output=OUTPUT_FILE):
    """Read the unicode data file and write the compact C tables.

    unicode_data -- path of the semicolon-separated unicode data file
                    (defaults to UNICODE_DATA).
    output       -- path of the C file to create (defaults to OUTPUT_FILE).

    The output holds a list of unique property records, the category and
    bidirectional name tables, and a two-level index (index1/index2 plus
    SHIFT) that maps a character code to its record.

    Raises ValueError if a record uses a category or bidirectional name
    that is not listed in CATEGORY_NAMES / BIDIRECTIONAL_NAMES.
    """
    db = UnicodeData(unicode_data)

    # extract unicode properties
    dummy = (0, 0, 0, 0, "NULL")
    table = [dummy]
    # Maps record tuple -> position in table.  Seeding it with the dummy
    # record makes a real record with identical fields reuse slot 0
    # instead of being emitted a second time.
    cache = {dummy: 0}
    index = [0] * len(db.chars)

    for char in db.chars:
        record = db.table[char]
        if not record:
            # unassigned characters keep index 0 (the dummy record)
            continue

        # extract database properties
        category = CATEGORY_NAMES.index(record[2])
        combining = int(record[3])
        bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
        mirrored = int(record[9] == "Y")
        if record[5]:
            decomposition = '"%s"' % record[5]
        else:
            decomposition = "NULL"
        item = (category, combining, bidirectional, mirrored, decomposition)

        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # FIXME: we really should compress the decomposition stuff
    # (see the unidb utilities for one way to do this)

    # split index table; done before the output file is opened so that a
    # failure here does not leave a half-written file behind
    index1, index2, shift = splitbins(index)

    # Write through an explicit file object rather than rebinding
    # sys.stdout, so the caller's stdout is never touched and the file is
    # closed even if writing fails.
    out = open(output, "w")
    try:
        write = out.write

        write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        write("\n")
        write("/* a list of unique database records */\n")
        write("const _PyUnicode_DatabaseRecord "
              "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            write(" {%d, %d, %d, %d, %s},\n" % item)
        write("};\n")
        write("\n")

        write("/* string literals */\n")
        write("const char *_PyUnicode_CategoryNames[] = {\n")
        for name in CATEGORY_NAMES:
            write(" \"%s\",\n" % name)
        write(" NULL\n")
        write("};\n")

        write("const char *_PyUnicode_BidirectionalNames[] = {\n")
        for name in BIDIRECTIONAL_NAMES:
            write(" \"%s\",\n" % name)
        write(" NULL\n")
        write("};\n")

        write("/* index tables used to find the right database record */\n")
        write("#define SHIFT %s\n" % shift)
        Array("index1", index1).dump(out)
        Array("index2", index2).dump(out)
    finally:
        out.close()


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    """In-memory copy of a semicolon-separated unicode data file.

    Public attributes:
    filename -- the path that was loaded
    table    -- list of 65536 entries indexed by character code; each
                entry is the list of fields of that character's line,
                or None if the file has no line for it
    chars    -- the character codes to process (all 65536 by default)
    """

    def __init__(self, filename):
        table = [None] * 65536
        fp = open(filename)
        try:
            for line in fp:
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                fields = line.split(";")
                # first field is the character code in hexadecimal
                table[int(fields[0], 16)] = fields
        finally:
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)


# stuff to deal with arrays of unsigned integers

class Array:
    """A named sequence of unsigned integers that can be dumped as C."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to file (any object with write()) as a static
        C array, using the smallest unsigned type that fits the values
        and wrapping the initializer lines at 78 columns."""
        size = getsize(self.data)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = " "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    # current line is full; flush it and start a new one
                    file.write(s + "\n")
                    s = " " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")


def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) that can
    hold every value in data.  Empty data needs no storage per item and
    is reported as 1 so that callers such as Array.dump do not fail."""
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(bins):
    """Split a sparse integer table into two smaller tables.

    Returns (t1, t2, shift) such that, for every position char in bins,

        value = t2[(t1[char >> shift] << shift) + (char & mask)]

    with mask = (1 << shift) - 1.  Every shift from 0 to 15 is tried and
    the one with the smallest combined size (as measured by getsize) is
    returned; on a tie the smaller shift wins.  None entries in bins are
    stored as 0.
    """
    # NOTE(review): the shift loop and the construction of bin1/bin2 were
    # missing from the text this was restored from.  They are rebuilt
    # from the lookup formula above and the surviving "determine memory
    # size" tail -- please compare against the upstream unidb utilities.
    best = None
    best_size = None
    for shift in range(16):
        bin1 = []
        bin2 = []
        size = 1 << shift
        bincache = {}
        for start in range(0, len(bins), size):
            chunk = tuple(bins[start:start + size])
            offset = bincache.get(chunk)
            if offset is None:
                # first time this chunk is seen: append it to bin2
                offset = len(bin2)
                bincache[chunk] = offset
                for value in chunk:
                    if value is None:
                        bin2.append(0)
                    else:
                        bin2.append(value)
            # offsets are always multiples of the chunk size, so only
            # the chunk number needs to be stored
            bin1.append(offset >> shift)
        # determine memory size
        b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
        if best is None or b < best_size:
            best = shift, bin1, bin2
            best_size = b
    shift, bin1, bin2 = best
    return bin1, bin2, shift


if __name__ == "__main__":
    maketable()