author     Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>    2017-11-30 18:00:28 +0000
committer  Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>    2017-11-30 18:05:00 +0000
commit     a2d5e02ee96381ef7c438f530f6326c0e2909d38 (patch)
tree       5eba944b178a94d11c861194f18cdbe23d8b3869 /smartypants.py
parent     0f44a39bda1b3d3064a91cd2de1edab583b93828 (diff)
download   smartypants-git-regex-speedup.tar.gz
Use the regex lib and atomics to speed up _tokenize
Use an atomic group (part of the PCRE syntax) to speed up tokenizing of
large texts with very few matches. On an 8,400-character text containing
no tags, execution time improved from 0.5s to 0.03s.
With credit to James Kirrage
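A minimal sketch (not part of the commit) of the effect being measured, assuming the third-party ``regex`` module is installed; the demo text and iteration count are invented:

import timeit

import regex  # third-party: https://pypi.python.org/pypi/regex/

# 8,400 characters with no '<' anywhere, mirroring the worst case from
# the commit message.
text = 'a' * 8400

# Without the atomic group, every failed match attempt backtracks
# through the greedy [^<]* run one character at a time before giving up.
backtracking = regex.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', regex.S)

# The atomic group (?>...) forbids re-entering the run, so each attempt
# fails after a single greedy scan.
atomic = regex.compile(r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)', regex.S)

print('plain:  ', timeit.timeit(lambda: backtracking.search(text), number=10))
print('atomic: ', timeit.timeit(lambda: atomic.search(text), number=10))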
Diffstat (limited to 'smartypants.py')
-rwxr-xr-x  smartypants.py  17
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/smartypants.py b/smartypants.py
index f248c07..39e0021 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -20,7 +20,13 @@
 __license__ = 'BSD License'
 __url__ = 'https://github.com/leohemsted/smartypants.py'
 __description__ = 'Python with the SmartyPants'
-import re
+try:
+    import regex as re
+    # regex uses atomics to improve performance
+    TAG_REGEX = r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)'
+except ImportError:
+    import re
+    TAG_REGEX = r'([^<]*)(<!--.*?--\s*>|<[^>]*>)'
 
 
 class _Attr(object):
@@ -213,7 +219,6 @@ def smartypants(text, attr=None):
     # the last character of the previous text
     # token, to use as context to curl single-
     # character quote tokens correctly.
-
     tags_to_skip_regex = _tags_to_skip_regex()
 
     for cur_token in tokens:
@@ -564,11 +569,15 @@ def _tokenize(text):
     Based on the _tokenize() subroutine from
     `Brad Choate's MTRegex plugin`__.
 
     __ http://www.bradchoate.com/past/mtregex.php
-    """
+
+    If you have the ``regex`` library (https://pypi.python.org/pypi/regex/),
+    this function will use an alternative regex to perform significantly faster
+    on large input texts.
+    """
 
     tokens = []
 
-    tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
+    tag_soup = re.compile(TAG_REGEX, re.S)
 
     token_match = tag_soup.search(text)
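For context, a self-contained sketch of the optional-dependency pattern the patch introduces: prefer ``regex`` when it is available, and fall back to the stdlib ``re`` with an equivalent, atomic-free pattern. The names mirror the diff; the demo string is invented:

try:
    import regex as re  # third-party engine that supports atomic groups
    TAG_REGEX = r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)'
except ImportError:
    import re  # stdlib fallback: same pattern without the atomic group
    TAG_REGEX = r'([^<]*)(<!--.*?--\s*>|<[^>]*>)'

tag_soup = re.compile(TAG_REGEX, re.S)

# Either engine splits input into (leading text, tag) pairs:
m = tag_soup.search('hello <b>world</b>')
print(m.groups())  # ('hello ', '<b>')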