diff options
| author | Thomas Kluyver <takowl@gmail.com> | 2010-10-05 20:08:57 +0100 |
|---|---|---|
| committer | Thomas Kluyver <takowl@gmail.com> | 2010-10-05 20:08:57 +0100 |
| commit | c919314b46d4211cac896ef908ee136a96540427 (patch) | |
| tree | 9ef390b4d5ae4042849d9030ce7d60ef2e5d285b /unidecode | |
| parent | cb7de0938b507b3e3fa1be2eafb315a6da389146 (diff) | |
| download | unidecode-c919314b46d4211cac896ef908ee136a96540427.tar.gz | |
Simplify code, and make a Python 3 version of the tests.
Diffstat (limited to 'unidecode')
| -rw-r--r-- | unidecode/__init__.py | 96 |
1 files changed, 51 insertions, 45 deletions
```diff
diff --git a/unidecode/__init__.py b/unidecode/__init__.py
index d52679b..b32f44a 100644
--- a/unidecode/__init__.py
+++ b/unidecode/__init__.py
@@ -1,48 +1,54 @@
-"""ASCII transliterations of Unicode text
-"""
-Char = {}
+# -*- coding: utf-8 -*-
+"""Transliterate Unicode text into plain 7-bit ASCII.
+
+Example usage:
+>>> from unidecode import unidecode:
+>>> unidecode(u"\u5317\u4EB0")
+"Bei Jing "
 
-NULLMAP = [ '' * 0x100 ]
+The transliteration uses a straightforward map, and doesn't have alternatives
+for the same character based on language, position, or anything else.
+
+In Python 3, a standard string object will be returned. If you need bytes, use:
+>>> unidecode("Κνωσός").encode("ascii")
+b'Knosos'
+"""
+Cache = {}
 
 def unidecode(string):
-    """Transliterate an Unicode object into an ASCII string
-
-    >>> unidecode(u"\u5317\u4EB0")
-    "Bei Jing "
-    """
-
-    retval = []
-
-    for char in string:
-        o = ord(char)
-
-        if o < 0x80:
-            retval.append(char)
-            continue
-
-        h = o >> 8
-        l = o & 0xff
-
-        c = Char.get(h, None)
-
-        if c == None:
-            try:
-                mod = __import__('unidecode.x%02x'%(h), [], [], ['data'])
-            except ImportError:
-                Char[h] = NULLMAP
-                retval.append('')
-                continue
-
-            Char[h] = mod.data
-
-            try:
-                retval.append( mod.data[l] )
-            except IndexError:
-                retval.append( '' )
-        else:
-            try:
-                retval.append( c[l] )
-            except IndexError:
-                retval.append( '' )
-
-    return ''.join(retval)
+    """Transliterate an Unicode object into an ASCII string
+
+    >>> unidecode(u"\u5317\u4EB0")
+    "Bei Jing "
+    """
+
+    retval = []
+
+    for char in string:
+        codepoint = ord(char)
+
+        if codepoint < 0x80: # Basic ASCII
+            retval.append(char)
+            continue
+
+        if codepoint > 0xffff:
+            continue # We don't support characters beyond the BMP.
+
+        section = codepoint >> 8 # Chop off the last two hex digits
+        position = codepoint % 256 # Last two hex digits
+
+        try:
+            table = Cache[section]
+        except KeyError:
+            try:
+                mod = __import__('unidecode.x%02x'%(section), [], [], ['data'])
+            except ImportError:
+                Cache[section] = None
+                continue # No match: ignore this character and carry on.
+
+            Cache[section] = table = mod.data
+
+        if table:
+            retval.append( table[position] )
+
+    return ''.join(retval)
\ No newline at end of file
```
