Simplify code, and make a Python 3 version of the tests.

author: Thomas Kluyver <takowl@gmail.com> 2010-10-05 20:08:57 +0100
committer: Thomas Kluyver <takowl@gmail.com> 2010-10-05 20:08:57 +0100
commit: c919314b46d4211cac896ef908ee136a96540427 (patch)
tree: 9ef390b4d5ae4042849d9030ce7d60ef2e5d285b /unidecode
parent: cb7de0938b507b3e3fa1be2eafb315a6da389146 (diff)
download: unidecode-c919314b46d4211cac896ef908ee136a96540427.tar.gz
1 files changed, 51 insertions, 45 deletions
diff --git a/unidecode/__init__.py b/unidecode/__init__.py
index d52679b..b32f44a 100644
--- a/unidecode/__init__.py
+++ b/unidecode/__init__.py
@@ -1,48 +1,54 @@
-"""ASCII transliterations of Unicode text
-"""
-Char = {}
+# -*- coding: utf-8 -*-
+"""Transliterate Unicode text into plain 7-bit ASCII.
+
+Example usage:
+>>> from unidecode import unidecode
+>>> unidecode(u"\u5317\u4EB0")
+"Bei Jing "
 
-NULLMAP = [ '' * 0x100 ]
+The transliteration uses a straightforward map, and doesn't have alternatives
+for the same character based on language, position, or anything else.
+
+In Python 3, a standard string object will be returned. If you need bytes, use:
+>>> unidecode("Κνωσός").encode("ascii")
+b'Knosos'
+"""
+Cache = {}
 
 def unidecode(string):
-	"""Transliterate an Unicode object into an ASCII string
-
-	>>> unidecode(u"\u5317\u4EB0")
-	"Bei Jing "
-	"""
-
-	retval = []
-
-	for char in string:
-		o = ord(char)
-
-		if o < 0x80:
-			retval.append(char)
-			continue
-
-		h = o >> 8
-		l = o & 0xff
-
-		c = Char.get(h, None)
-		
-		if c == None:
-			try:
-				mod = __import__('unidecode.x%02x'%(h), [], [], ['data'])
-			except ImportError:
-				Char[h] = NULLMAP
-				retval.append('')
-				continue
-
-			Char[h] = mod.data
-
-			try:
-				retval.append( mod.data[l] )
-			except IndexError:
-				retval.append( '' )
-		else:
-			try:
-				retval.append( c[l] )
-			except IndexError:
-				retval.append( '' )
-
-	return ''.join(retval)
+    """Transliterate an Unicode object into an ASCII string
+
+    >>> unidecode(u"\u5317\u4EB0")
+    "Bei Jing "
+    """
+
+    retval = []
+
+    for char in string:
+        codepoint = ord(char)
+
+        if codepoint < 0x80: # Basic ASCII
+            retval.append(char)
+            continue
+        
+        if codepoint > 0xffff:
+            continue # We don't support characters beyond the BMP.
+
+        section = codepoint >> 8   # Chop off the last two hex digits
+        position = codepoint % 256 # Last two hex digits
+
+        try:
+            table = Cache[section]
+        except KeyError:
+            try:
+                mod = __import__('unidecode.x%02x'%(section), [], [], ['data'])
+            except ImportError:
+                Cache[section] = None
+                continue   # No match: ignore this character and carry on.
+
+            Cache[section] = table = mod.data
+
+        if table:
+            retval.append( table[position] )
+
+    return ''.join(retval)
+\ No newline at end of file
author	Thomas Kluyver <takowl@gmail.com>	2010-10-05 20:08:57 +0100
committer	Thomas Kluyver <takowl@gmail.com>	2010-10-05 20:08:57 +0100
commit	c919314b46d4211cac896ef908ee136a96540427 (patch)
tree	9ef390b4d5ae4042849d9030ce7d60ef2e5d285b /unidecode
parent	cb7de0938b507b3e3fa1be2eafb315a6da389146 (diff)
download	unidecode-c919314b46d4211cac896ef908ee136a96540427.tar.gz