summaryrefslogtreecommitdiff
path: root/unidecode
diff options
context:
space:
mode:
author: Thomas Kluyver <takowl@gmail.com> 2010-10-05 20:08:57 +0100
committer: Thomas Kluyver <takowl@gmail.com> 2010-10-05 20:08:57 +0100
commit: c919314b46d4211cac896ef908ee136a96540427 (patch)
tree: 9ef390b4d5ae4042849d9030ce7d60ef2e5d285b /unidecode
parent: cb7de0938b507b3e3fa1be2eafb315a6da389146 (diff)
download: unidecode-c919314b46d4211cac896ef908ee136a96540427.tar.gz
Simplify code, and make a Python 3 version of the tests.
Diffstat (limited to 'unidecode')
-rw-r--r--unidecode/__init__.py96
1 files changed, 51 insertions, 45 deletions
diff --git a/unidecode/__init__.py b/unidecode/__init__.py
index d52679b..b32f44a 100644
--- a/unidecode/__init__.py
+++ b/unidecode/__init__.py
@@ -1,48 +1,54 @@
-"""ASCII transliterations of Unicode text
-"""
-Char = {}
+# -*- coding: utf-8 -*-
+"""Transliterate Unicode text into plain 7-bit ASCII.
+
+Example usage:
+>>> from unidecode import unidecode
+>>> unidecode(u"\u5317\u4EB0")
+'Bei Jing '
-NULLMAP = [ '' * 0x100 ]
+The transliteration uses a straightforward map, and doesn't have alternatives
+for the same character based on language, position, or anything else.
+
+In Python 3, a standard string object will be returned. If you need bytes, use:
+>>> unidecode("Κνωσός").encode("ascii")
+b'Knosos'
+"""
+Cache = {} # Maps section (codepoint >> 8) to its table, or None if no table module exists.
def unidecode(string):
- """Transliterate an Unicode object into an ASCII string
-
- >>> unidecode(u"\u5317\u4EB0")
- "Bei Jing "
- """
-
- retval = []
-
- for char in string:
- o = ord(char)
-
- if o < 0x80:
- retval.append(char)
- continue
-
- h = o >> 8
- l = o & 0xff
-
- c = Char.get(h, None)
-
- if c == None:
- try:
- mod = __import__('unidecode.x%02x'%(h), [], [], ['data'])
- except ImportError:
- Char[h] = NULLMAP
- retval.append('')
- continue
-
- Char[h] = mod.data
-
- try:
- retval.append( mod.data[l] )
- except IndexError:
- retval.append( '' )
- else:
- try:
- retval.append( c[l] )
- except IndexError:
- retval.append( '' )
-
- return ''.join(retval)
+    """Transliterate a Unicode string into a plain ASCII string.
+
+    Characters with no transliteration (no table module for their section,
+    a table too short for them, or a codepoint beyond the BMP) are dropped.
+
+    >>> unidecode(u"\u5317\u4EB0")
+    'Bei Jing '
+    """
+    retval = []
+
+    for char in string:
+        codepoint = ord(char)
+        if codepoint < 0x80: # Basic ASCII passes through unchanged
+            retval.append(char)
+            continue
+        if codepoint > 0xffff:
+            continue # We don't support characters beyond the BMP.
+
+        section = codepoint >> 8 # High byte: selects the unidecode.xNN module
+        position = codepoint % 256 # Low byte: index into that module's table
+
+        try:
+            table = Cache[section]
+        except KeyError:
+            try:
+                mod = __import__('unidecode.x%02x'%(section), [], [], ['data'])
+            except ImportError:
+                Cache[section] = None # Remember the miss so the import is not retried
+                continue # No match: ignore this character and carry on.
+            Cache[section] = table = mod.data
+
+        # A table may hold fewer than 256 entries; skip positions past its end.
+        if table and len(table) > position:
+            retval.append( table[position] )
+
+    return ''.join(retval)
\ No newline at end of file