author     Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>    2017-11-30 18:00:28 +0000
committer  Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>    2017-11-30 18:05:00 +0000
commit     a2d5e02ee96381ef7c438f530f6326c0e2909d38 (patch)
tree       5eba944b178a94d11c861194f18cdbe23d8b3869 /smartypants.py
parent     0f44a39bda1b3d3064a91cd2de1edab583b93828 (diff)
download   smartypants-git-regex-speedup.tar.gz
Use the regex lib and atomics to speed up _tokenize
Use an atomic group (part of the PCRE syntax) to speed up tokenizing of
large texts with very few matches. On an 8,400-character text containing
no tags, execution time improved from 0.5s to 0.03s.
With credit to James Kirrage
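A minimal sketch (not part of the commit) of the effect being measured, assuming the third-party ``regex`` module is installed; the demo text and iteration count are invented:

import timeit

import regex  # third-party: https://pypi.python.org/pypi/regex/

# 8,400 characters with no '<' anywhere, mirroring the worst case from
# the commit message.
text = 'a' * 8400

# Without the atomic group, every failed match attempt backtracks
# through the greedy [^<]* run one character at a time before giving up.
backtracking = regex.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', regex.S)

# The atomic group (?>...) forbids re-entering the run, so each attempt
# fails after a single greedy scan.
atomic = regex.compile(r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)', regex.S)

print('plain:  ', timeit.timeit(lambda: backtracking.search(text), number=10))
print('atomic: ', timeit.timeit(lambda: atomic.search(text), number=10))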
Diffstat (limited to 'smartypants.py')
-rwxr-xr-x  smartypants.py  17
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/smartypants.py b/smartypants.py
index f248c07..39e0021 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -20,7 +20,13 @@
 __license__ = 'BSD License'
 __url__ = 'https://github.com/leohemsted/smartypants.py'
 __description__ = 'Python with the SmartyPants'
-import re
+try:
+    import regex as re
+    # regex uses atomics to improve performance
+    TAG_REGEX = r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)'
+except ImportError:
+    import re
+    TAG_REGEX = r'([^<]*)(<!--.*?--\s*>|<[^>]*>)'
 
 
 class _Attr(object):
@@ -213,7 +219,6 @@ def smartypants(text, attr=None):
     # the last character of the previous text
     # token, to use as context to curl single-
     # character quote tokens correctly.
-
     tags_to_skip_regex = _tags_to_skip_regex()
 
     for cur_token in tokens:
@@ -564,11 +569,15 @@ def _tokenize(text):
     Based on the _tokenize() subroutine from
     `Brad Choate's MTRegex plugin`__.
 
     __ http://www.bradchoate.com/past/mtregex.php
-    """
+
+    If you have the ``regex`` library (https://pypi.python.org/pypi/regex/),
+    this function will use an alternative regex to perform significantly faster
+    on large input texts.
+    """
 
     tokens = []
 
-    tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
+    tag_soup = re.compile(TAG_REGEX, re.S)
 
     token_match = tag_soup.search(text)
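For context, a self-contained sketch of the optional-dependency pattern the patch introduces: prefer ``regex`` when it is available, and fall back to the stdlib ``re`` with an equivalent, atomic-free pattern. The names mirror the diff; the demo string is invented:

try:
    import regex as re  # third-party engine that supports atomic groups
    TAG_REGEX = r'((?>[^<]*))(<!--.*?--\s*>|<[^>]*>)'
except ImportError:
    import re  # stdlib fallback: same pattern without the atomic group
    TAG_REGEX = r'([^<]*)(<!--.*?--\s*>|<[^>]*>)'

tag_soup = re.compile(TAG_REGEX, re.S)

# Either engine splits input into (leading text, tag) pairs:
m = tag_soup.search('hello <b>world</b>')
print(m.groups())  # ('hello ', '<b>')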