summaryrefslogtreecommitdiff
path: root/unidecode/__init__.py
diff options
context:
space:
mode:
authordukebody <israel.saeta@dukebody.com>2015-11-14 17:08:53 +0100
committerdukebody <israel.saeta@dukebody.com>2015-11-14 17:08:53 +0100
commitbe04113429619571dc3f639bdf6db4f05a1e6fd3 (patch)
tree246ff0a3d6ededde2e6709bc8a865ae216817ba5 /unidecode/__init__.py
parent28cf185b973c1cc91bed655adfd13444092083d9 (diff)
downloadunidecode-be04113429619571dc3f639bdf6db4f05a1e6fd3.tar.gz
Add unidecode_fast function to speed up mostly-ASCII transliterations.
Diffstat (limited to 'unidecode/__init__.py')
-rw-r--r--unidecode/__init__.py36
1 files changed, 30 insertions, 6 deletions
diff --git a/unidecode/__init__.py b/unidecode/__init__.py
index ac5b86d..94dd970 100644
--- a/unidecode/__init__.py
+++ b/unidecode/__init__.py
@@ -19,18 +19,42 @@ from sys import version_info
Cache = {}
+
+def _warn_if_not_unicode(string):
+ if version_info[0] < 3 and not isinstance(string, unicode):
+ warnings.warn( "Argument %r is not an unicode object. "
+ "Passing an encoded string will likely have "
+ "unexpected results." % (type(string),),
+ RuntimeWarning, 2)
+
+
+def unidecode_fast(string):
+ """
+ Try to transliterate using ASCII codec. If it fails, fall back to
+ transliteration using the character tables.
+
+ This is approx. five times faster if the string only contains ASCII
+ characters, but slightly slower than using unidecode directly if non-ASCII
+ chars are present.
+ """
+ _warn_if_not_unicode(string)
+ try:
+ bytestring = string.encode('ASCII')
+ except UnicodeEncodeError:
+ return unidecode(string)
+ if version_info[0] >= 3:
+ return string
+ else:
+ return bytestring
+
+
def unidecode(string):
"""Transliterate an Unicode object into an ASCII string
>>> unidecode(u"\u5317\u4EB0")
"Bei Jing "
"""
-
- if version_info[0] < 3 and not isinstance(string, unicode):
- warnings.warn( "Argument %r is not an unicode object. "
- "Passing an encoded string will likely have "
- "unexpected results." % (type(string),),
- RuntimeWarning, 2)
+ _warn_if_not_unicode(string)
retval = []