diff options
| author | dukebody <israel.saeta@dukebody.com> | 2015-11-14 17:08:53 +0100 |
|---|---|---|
| committer | dukebody <israel.saeta@dukebody.com> | 2015-11-14 17:08:53 +0100 |
| commit | be04113429619571dc3f639bdf6db4f05a1e6fd3 (patch) | |
| tree | 246ff0a3d6ededde2e6709bc8a865ae216817ba5 /unidecode/__init__.py | |
| parent | 28cf185b973c1cc91bed655adfd13444092083d9 (diff) | |
| download | unidecode-be04113429619571dc3f639bdf6db4f05a1e6fd3.tar.gz | |
Add unidecode_fast function to speedup mostly-ASCII transliterations.
Diffstat (limited to 'unidecode/__init__.py')
| -rw-r--r-- | unidecode/__init__.py | 36 |
1 files changed, 30 insertions, 6 deletions
diff --git a/unidecode/__init__.py b/unidecode/__init__.py
index ac5b86d..94dd970 100644
--- a/unidecode/__init__.py
+++ b/unidecode/__init__.py
@@ -19,18 +19,42 @@ from sys import version_info
 
 Cache = {}
 
+
+def _warn_if_not_unicode(string):
+    if version_info[0] < 3 and not isinstance(string, unicode):
+        warnings.warn( "Argument %r is not an unicode object. "
+                       "Passing an encoded string will likely have "
+                       "unexpected results." % (type(string),),
+                       RuntimeWarning, 2)
+
+
+def unidecode_fast(string):
+    """
+    Try to transliterate using ASCII codec. If it fails, fall back to
+    transliteration using the character tables.
+
+    This is approx. five times faster if the string only contains ASCII
+    characters, but sligthly slower than using unidecode directly non-ASCII
+    chars are present.
+    """
+    _warn_if_not_unicode(string)
+    try:
+        bytestring = string.encode('ASCII')
+    except UnicodeEncodeError:
+        return unidecode(string)
+    if version_info[0] >= 3:
+        return string
+    else:
+        return bytestring
+
+
 def unidecode(string):
     """Transliterate an Unicode object into an ASCII string
 
     >>> unidecode(u"\u5317\u4EB0")
     "Bei Jing "
     """
-
-    if version_info[0] < 3 and not isinstance(string, unicode):
-        warnings.warn( "Argument %r is not an unicode object. "
-                       "Passing an encoded string will likely have "
-                       "unexpected results." % (type(string),),
-                       RuntimeWarning, 2)
+    _warn_if_not_unicode(string)
 
     retval = []
 
