summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>2017-11-30 18:00:28 +0000
committerLeo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>2017-11-30 18:05:00 +0000
commita2d5e02ee96381ef7c438f530f6326c0e2909d38 (patch)
tree5eba944b178a94d11c861194f18cdbe23d8b3869
parent0f44a39bda1b3d3064a91cd2de1edab583b93828 (diff)
downloadsmartypants-git-regex-speedup.tar.gz
Use the regex lib and atomics to speed up _tokenize
Use an atomic (part of the PCRE syntax) to speed up tokenizing of large texts with very few matches. Improved 8400 char text with no tags from 0.5secs execution time to 0.03secs. With credit to James Kirrage.
-rwxr-xr-xsmartypants.py17
1 files changed, 13 insertions, 4 deletions
diff --git a/smartypants.py b/smartypants.py
index f248c07..39e0021 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -20,7 +20,13 @@ __license__ = 'BSD License'
__url__ = 'https://github.com/leohemsted/smartypants.py'
__description__ = 'Python with the SmartyPants'
-import re
+try:
+ import regex as re
+ # regex uses atomics to improve performance
+ TAG_REGEX = r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)'
+except ImportError:
+ import re
+ TAG_REGEX = r'([^<]*)(<!--.*?--\s*>|<[^>]*>)'
class _Attr(object):
@@ -213,7 +219,6 @@ def smartypants(text, attr=None):
# the last character of the previous text
# token, to use as context to curl single-
# character quote tokens correctly.
-
tags_to_skip_regex = _tags_to_skip_regex()
for cur_token in tokens:
@@ -564,11 +569,15 @@ def _tokenize(text):
Based on the _tokenize() subroutine from `Brad Choate's MTRegex plugin`__.
__ http://www.bradchoate.com/past/mtregex.php
- """
+
+ If you have the ``regex`` library (https://pypi.python.org/pypi/regex/),
+ this function will use an alternative regex to perform significantly faster
+ on large input texts.
+ """
tokens = []
- tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
+ tag_soup = re.compile(TAG_REGEX, re.S)
token_match = tag_soup.search(text)