The default size of the re module's compiled regular expression cache has been increased from 100 to 500, and the cache replacement policy has changed from simply clearing the entire cache on overflow to randomly forgetting 20% of the existing cached compiled regular expressions. This is a performance win for applications that use a lot of regular expressions, and it limits the performance hit whenever the cache size is exceeded.
author: Gregory P. Smith <greg@mad-scientist.com> 2010-07-27 05:31:29 +0000
committer: Gregory P. Smith <greg@mad-scientist.com> 2010-07-27 05:31:29 +0000
commit: 5a63183a8b8a9e177f97feac975850df5e6f98aa (patch)
tree: 5cc448b4d1c43daf25f07973360871bf449c773d /Lib/re.py
parent: f5ae1efd417b06cef5164cac79e2cc46afa6dd80 (diff)
download: cpython-git-5a63183a8b8a9e177f97feac975850df5e6f98aa.tar.gz
1 files changed, 36 insertions, 3 deletions
diff --git a/Lib/re.py b/Lib/re.py
index 9bd913a68d..2f1a76eaa5 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -254,7 +254,40 @@ _cache_repl = {}
 
 _pattern_type = type(sre_compile.compile("", 0))
 
-_MAXCACHE = 100
+_MAXCACHE = 500
+
+def _shrink_cache(cache_dict, max_length, divisor=5):
+    """Make room in the given cache.
+
+    Args:
+        cache_dict: The cache dictionary to modify.
+        max_length: Maximum # of entries in cache_dict before it is shrunk.
+        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
+    """
+    # Toss out a fraction of the entries at random to make room for new ones.
+    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
+    # as popitem could penalize the same regular expression repeatedly based
+    # on its internal hash value.  Being random should spread the cache miss
+    # love around.
+    cache_keys = tuple(cache_dict.keys())
+    overage = len(cache_keys) - max_length
+    if overage < 0:
+        # Cache is already within limits.  Normally this should not happen
+        # but it could due to multithreading.
+        return
+    number_to_toss = max_length // divisor + overage
+    # The import is done here to avoid a circular dependency.
+    import random
+    if not hasattr(random, 'sample'):
+        # Do nothing while resolving the circular dependency:
+        #  re->random->warnings->tokenize->string->re
+        return
+    for doomed_key in random.sample(cache_keys, number_to_toss):
+        try:
+            del cache_dict[doomed_key]
+        except KeyError:
+            # Ignore problems if the cache changed from another thread.
+            pass
 
 def _compile(*key):
     # internal: compile pattern
@@ -272,7 +305,7 @@ def _compile(*key):
         raise TypeError("first argument must be string or compiled pattern")
     p = sre_compile.compile(pattern, flags)
     if len(_cache) >= _MAXCACHE:
-        _cache.clear()
+        _shrink_cache(_cache, _MAXCACHE)
     _cache[cachekey] = p
     return p
 
@@ -284,7 +317,7 @@ def _compile_repl(*key):
     repl, pattern = key
     p = sre_parse.parse_template(repl, pattern)
     if len(_cache_repl) >= _MAXCACHE:
-        _cache_repl.clear()
+        _shrink_cache(_cache_repl, _MAXCACHE)
     _cache_repl[key] = p
     return p
author	Gregory P. Smith <greg@mad-scientist.com>	2010-07-27 05:31:29 +0000
committer	Gregory P. Smith <greg@mad-scientist.com>	2010-07-27 05:31:29 +0000
commit	5a63183a8b8a9e177f97feac975850df5e6f98aa (patch)
tree	5cc448b4d1c43daf25f07973360871bf449c773d /Lib/re.py
parent	f5ae1efd417b06cef5164cac79e2cc46afa6dd80 (diff)
download	cpython-git-5a63183a8b8a9e177f97feac975850df5e6f98aa.tar.gz