author     Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>  2017-11-30 18:00:28 +0000
committer  Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>  2017-11-30 18:05:00 +0000
commit     a2d5e02ee96381ef7c438f530f6326c0e2909d38
tree       5eba944b178a94d11c861194f18cdbe23d8b3869
parent     0f44a39bda1b3d3064a91cd2de1edab583b93828
Use the regex lib and atomics to speed up _tokenize
Use an atomic group (part of the PCRE syntax) to speed up tokenizing of large texts with very few matches. Improves an 8,400-character text with no tags from 0.5s execution time to 0.03s. With credit to James Kirrage.
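As context for the change, a minimal standalone sketch (not part of the patch; timings are illustrative, not the commit's measurements) of why the atomic group helps: on tag-free input, the plain pattern retries `[^<]*` at every shorter length from every start position before the search fails, which is quadratic overall, while the atomic group `(?>...)` forbids backtracking into the group, so each attempt fails in one step:

# Illustrative sketch only: compares the patch's two patterns on
# tag-free input. Requires the third-party 'regex' library, which
# supports the (?>...) atomic-group syntax.
import timeit
import regex

text = 'x' * 8400  # tag-free input, like the case cited in the commit

backtracking = regex.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', regex.S)
atomic = regex.compile(r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)', regex.S)

print('backtracking:', timeit.timeit(lambda: backtracking.search(text), number=10))
print('atomic:      ', timeit.timeit(lambda: atomic.search(text), number=10))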
Diffstat (limited to 'smartypants.py')
 smartypants.py | 17 +++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/smartypants.py b/smartypants.py
index f248c07..39e0021 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -20,7 +20,13 @@ __license__ = 'BSD License'
__url__ = 'https://github.com/leohemsted/smartypants.py'
__description__ = 'Python with the SmartyPants'
-import re
+try:
+ import regex as re
+ # regex uses atomics to improve performance
+ TAG_REGEX = r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)'
+except ImportError:
+ import re
+ TAG_REGEX = r'([^<]*)(<!--.*?--\s*>|<[^>]*>)'
class _Attr(object):
@@ -213,7 +219,6 @@ def smartypants(text, attr=None):
# the last character of the previous text
# token, to use as context to curl single-
# character quote tokens correctly.
-
tags_to_skip_regex = _tags_to_skip_regex()
for cur_token in tokens:
@@ -564,11 +569,15 @@ def _tokenize(text):
Based on the _tokenize() subroutine from `Brad Choate's MTRegex plugin`__.
__ http://www.bradchoate.com/past/mtregex.php
- """
+
+ If you have the ``regex`` library (https://pypi.python.org/pypi/regex/),
+ this function will use an alternative regex to perform significantly faster
+ on large input texts.
+ """
tokens = []
- tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
+ tag_soup = re.compile(TAG_REGEX, re.S)
token_match = tag_soup.search(text)
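For reference, a simplified, self-contained sketch of how `_tokenize` consumes `TAG_REGEX` after this patch. The token-handling details of the real function are elided, so this is an approximation rather than the exact module code:

try:
    import regex as re  # third-party; supports the (?>...) atomic group
    TAG_REGEX = r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)'
except ImportError:
    import re  # stdlib fallback: same pattern without the atomic group
    TAG_REGEX = r'([^<]*)(<!--.*?--\s*>|<[^>]*>)'

def _tokenize(text):
    """Split text into a list of ['tag', ...] / ['text', ...] tokens."""
    tokens = []
    tag_soup = re.compile(TAG_REGEX, re.S)
    token_match = tag_soup.search(text)
    previous_end = 0
    while token_match:
        if token_match.group(1):
            tokens.append(['text', token_match.group(1)])
        tokens.append(['tag', token_match.group(2)])
        previous_end = token_match.end()
        token_match = tag_soup.search(text, token_match.end())
    if previous_end < len(text):
        # trailing text after the last tag
        tokens.append(['text', text[previous_end:]])
    return tokens

For example, _tokenize('a <b>c</b>') yields [['text', 'a '], ['tag', '<b>'], ['text', 'c'], ['tag', '</b>']] with either import path; only the search speed differs.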