summaryrefslogtreecommitdiff
path: root/pyparsing
diff options
context:
space:
mode:
authorMax Fischer <maxfischer2781@gmail.com>2021-07-30 14:51:33 +0200
committerGitHub <noreply@github.com>2021-07-30 07:51:33 -0500
commit5eafa07470a3e761944ff5af00dbcd794dfa09da (patch)
tree832e780435bee81bbe5f1c619b8be544a6426656 /pyparsing
parentd108a29db062c9250f50c978e3a86381d1b0746b (diff)
downloadpyparsing-git-left_recursion_support.tar.gz
Add support for LR parsingleft_recursion_support
* first draft of LR parsing * removed debug output * cache is owned and cleared by ParserElement * bounded recursion must be enabled explicitly * packrat rejects recursion * basic LR unit test * tests for associativity and nesting * added math example * fixed test typo * unittest for empty and non-peg clauses * LR-Forward can match Empty * fixed test typos * added base case to unittest * memo cache only provides copies * flattened Forward parse method * added high-level description of algorithm * expanded docstring * added tests for repetition rules * renamed bounded to left recursion * naive test for existing suite * explicitly testing tests for LR compatibility * LR memo no longer mixes action/no-action results * simplified replacement logic * adjusted example with ambiguous failure case * LR memo content is always returned as copy * draft for peeking recursion * memo update consistent for all actions * fixed a bug for non-string token identifiers * action wins against no-action * cleanup * properly setting names in tests * memoization can be turned off * testing memo switches * typos * flattened recursion memo * left recursion memo size may be limited * adjusted docs for recursion cache
Diffstat (limited to 'pyparsing')
-rw-r--r--pyparsing/core.py146
-rw-r--r--pyparsing/results.py2
-rw-r--r--pyparsing/testing.py3
-rw-r--r--pyparsing/util.py46
4 files changed, 194 insertions, 3 deletions
diff --git a/pyparsing/core.py b/pyparsing/core.py
index c4adb5d..d5ee05f 100644
--- a/pyparsing/core.py
+++ b/pyparsing/core.py
@@ -1,6 +1,7 @@
#
# core.py
#
+from typing import Optional
from abc import ABC, abstractmethod
from enum import Enum
import string
@@ -23,6 +24,8 @@ from .util import (
_escapeRegexRangeChars,
_bslash,
_flatten,
+ LRUMemo as _LRUMemo,
+ UnboundedMemo as _UnboundedMemo,
)
from .exceptions import *
from .actions import *
@@ -703,6 +706,10 @@ class ParserElement(ABC):
else:
return True
+ # cache for left-recursion in Forward references
+ recursion_lock = RLock()
+ recursion_memos = {} # type: dict[tuple[int, Forward, bool], tuple[int, ParseResults | Exception]]
+
# argument cache for optimizing repeated calls when backtracking through recursive expressions
packrat_cache = (
{}
@@ -766,11 +773,73 @@ class ParserElement(ABC):
ParserElement.packrat_cache_stats[:] = [0] * len(
ParserElement.packrat_cache_stats
)
+ ParserElement.recursion_memos.clear()
_packratEnabled = False
+ _left_recursion_enabled = False
+
+ @staticmethod
+ def disable_memoization():
+ """
+ Disables active Packrat or Left Recursion parsing and their memoization
+
+ This method also works if neither Packrat nor Left Recursion are enabled.
+ This makes it safe to call before activating Packrat nor Left Recursion
+ to clear any previous settings.
+ """
+ ParserElement.resetCache()
+ ParserElement._left_recursion_enabled = False
+ ParserElement._packratEnabled = False
+ ParserElement._parse = ParserElement._parseNoCache
+
+ @staticmethod
+ def enable_left_recursion(cache_size_limit: Optional[int] = None, *, force=False):
+ """
+ Enables "bounded recursion" parsing, which allows for both direct and indirect
+ left-recursion. During parsing, left-recursive :class:`Forward` elements are
+ repeatedly matched with a fixed recursion depth that is gradually increased
+ until finding the longest match.
+
+ Example::
+
+ import pyparsing as pp
+ pp.ParserElement.enable_left_recursion()
+
+ E = pp.Forward("E")
+ num = pp.Word(pp.nums)
+ # match `num`, or `num '+' num`, or `num '+' num '+' num`, ...
+ E <<= E + '+' - num | num
+
+ print(E.parseString("1+2+3"))
+
+ Recursion search naturally memoizes matches of ``Forward`` elements and may
+ thus skip reevaluation of parse actions during backtracking. This may break
+ programs with parse actions which rely on strict ordering of side-effects.
+
+ Parameters:
+
+ - cache_size_limit - (default=``None``) - memoize at most this many
+ ``Forward`` elements during matching; if ``None`` (the default),
+ memoize all ``Forward`` elements.
+
+ Bounded Recursion parsing works similar but not identical to Packrat parsing,
+ thus the two cannot be used together. Use ``force=True`` to disable any
+ previous, conflicting settings.
+ """
+ if force:
+ ParserElement.disable_memoization()
+ elif ParserElement._packratEnabled:
+ raise RuntimeError("Packrat and Bounded Recursion are not compatible")
+ if cache_size_limit is None:
+ ParserElement.recursion_memos = _UnboundedMemo()
+ elif cache_size_limit > 0:
+ ParserElement.recursion_memos = _LRUMemo(capacity=cache_size_limit)
+ else:
+ raise NotImplementedError("Memo size of %s" % cache_size_limit)
+ ParserElement._left_recursion_enabled = True
@staticmethod
- def enablePackrat(cache_size_limit=128):
+ def enablePackrat(cache_size_limit=128, *, force=False):
"""Enables "packrat" parsing, which adds memoizing to the parsing logic.
Repeated parse attempts at the same string location (which happens
often in many complex grammars) can immediately return a cached value,
@@ -795,7 +864,15 @@ class ParserElement(ABC):
import pyparsing
pyparsing.ParserElement.enablePackrat()
+
+ Packrat parsing works similar but not identical to Bounded Recursion parsing,
+ thus the two cannot be used together. Use ``force=True`` to disable any
+ previous, conflicting settings.
"""
+ if force:
+ ParserElement.disable_memoization()
+ elif ParserElement._left_recursion_enabled:
+ raise RuntimeError("Packrat and Bounded Recursion are not compatible")
if not ParserElement._packratEnabled:
ParserElement._packratEnabled = True
if cache_size_limit is None:
@@ -4390,7 +4467,72 @@ class Forward(ParseElementEnhance):
"Forward expression was never assigned a value, will not parse any input",
stacklevel=stacklevel,
)
- return super().parseImpl(instring, loc, doActions)
+ if not ParserElement._left_recursion_enabled:
+ return super().parseImpl(instring, loc, doActions)
+ # ## Bounded Recursion algorithm ##
+ # Recursion only needs to be processed at ``Forward`` elements, since they are
+ # the only ones that can actually refer to themselves. The general idea is
+ # to handle recursion stepwise: We start at no recursion, then recurse once,
+ # recurse twice, ..., until more recursion offers no benefit (we hit the bound).
+ #
+ # The "trick" here is that each ``Forward`` gets evaluated in two contexts
+ # - to *match* a specific recursion level, and
+ # - to *search* the bounded recursion level
+ # and the two run concurrently. The *search* must *match* each recursion level
+ # to find the best possible match. This is handled by a memo table, which
+ # provides the previous match to the next level match attempt.
+ #
+ # See also "Left Recursion in Parsing Expression Grammars", Medeiros et al.
+ #
+ # There is a complication since we not only *parse* but also *transform* via
+ # actions: We do not want to run the actions too often while expanding. Thus,
+ # we expand using `doActions=False` and only run `doActions=True` if the next
+ # recursion level is acceptable.
+ with ParserElement.recursion_lock:
+ memo = ParserElement.recursion_memos
+ try:
+ # we are parsing at a specific recursion expansion – use it as-is
+ prev_loc, prev_result = memo[loc, self, doActions]
+ if isinstance(prev_result, Exception):
+ raise prev_result
+ return prev_loc, prev_result.copy()
+ except KeyError:
+ act_key = (loc, self, True)
+ peek_key = (loc, self, False)
+ # we are searching for the best recursion expansion – keep on improving
+ # both `doActions` cases must be tracked separately here!
+ prev_loc, prev_peek = memo[peek_key] = loc - 1, ParseException(
+ instring, loc, "Forward recursion without base case", self
+ )
+ if doActions:
+ memo[act_key] = memo[peek_key]
+ while True:
+ try:
+ new_loc, new_peek = super().parseImpl(instring, loc, False)
+ except ParseException:
+ # we failed before getting any match – do not hide the error
+ if isinstance(prev_peek, Exception):
+ raise
+ new_loc, new_peek = prev_loc, prev_peek
+ # the match did not get better: we are done
+ if new_loc <= prev_loc:
+ if doActions:
+ # replace the match for doActions=False as well,
+ # in case the action did backtrack
+ prev_loc, prev_result = memo[peek_key] = memo[act_key]
+ del memo[peek_key], memo[act_key]
+ return prev_loc, prev_result.copy()
+ del memo[peek_key]
+ return prev_loc, prev_peek.copy()
+ # the match did get better: see if we can improve further
+ else:
+ if doActions:
+ try:
+ memo[act_key] = super().parseImpl(instring, loc, True)
+ except ParseException as e:
+ memo[peek_key] = memo[act_key] = (new_loc, e)
+ raise
+ prev_loc, prev_peek = memo[peek_key] = new_loc, new_peek
def leaveWhitespace(self, recursive=True):
self.skipWhitespace = False
diff --git a/pyparsing/results.py b/pyparsing/results.py
index fb03ecd..fa94a00 100644
--- a/pyparsing/results.py
+++ b/pyparsing/results.py
@@ -522,7 +522,7 @@ class ParseResults:
Returns a new copy of a :class:`ParseResults` object.
"""
ret = ParseResults(self._toklist)
- ret._tokdict = dict(**self._tokdict)
+ ret._tokdict = self._tokdict.copy()
ret._parent = self._parent
ret._all_names |= self._all_names
ret._name = self._name
diff --git a/pyparsing/testing.py b/pyparsing/testing.py
index 393f37b..43f23ab 100644
--- a/pyparsing/testing.py
+++ b/pyparsing/testing.py
@@ -20,6 +20,7 @@ class pyparsing_test:
"""
Context manager to be used when writing unit tests that modify pyparsing config values:
- packrat parsing
+ - bounded recursion parsing
- default whitespace characters.
- default keyword characters
- literal string auto-conversion class
@@ -61,6 +62,7 @@ class pyparsing_test:
else:
self._save_context["packrat_cache_size"] = None
self._save_context["packrat_parse"] = ParserElement._parse
+ self._save_context["recursion_enabled"] = ParserElement._left_recursion_enabled
self._save_context["__diag__"] = {
name: getattr(__diag__, name) for name in __diag__._all_names
@@ -97,6 +99,7 @@ class pyparsing_test:
ParserElement.enablePackrat(self._save_context["packrat_cache_size"])
else:
ParserElement._parse = self._save_context["packrat_parse"]
+ ParserElement._left_recursion_enabled = self._save_context["recursion_enabled"]
__compat__.collect_all_And_tokens = self._save_context["__compat__"]
diff --git a/pyparsing/util.py b/pyparsing/util.py
index 3cb69d2..875799d 100644
--- a/pyparsing/util.py
+++ b/pyparsing/util.py
@@ -119,6 +119,52 @@ class _FifoCache:
self.clear = types.MethodType(clear, self)
+class LRUMemo:
+ """
+ A memoizing mapping that retains `capacity` deleted items
+
+ The memo tracks retained items by their access order; once `capacity` items
+ are retained, the least recently used item is discarded.
+ """
+ def __init__(self, capacity):
+ self._capacity = capacity
+ self._active = {}
+ self._memory = collections.OrderedDict()
+
+ def __getitem__(self, key):
+ try:
+ return self._active[key]
+ except KeyError:
+ self._memory.move_to_end(key)
+ return self._memory[key]
+
+ def __setitem__(self, key, value):
+ self._memory.pop(key, None)
+ self._active[key] = value
+
+ def __delitem__(self, key):
+ try:
+ value = self._active.pop(key)
+ except KeyError:
+ pass
+ else:
+ while len(self._memory) >= self._capacity:
+ self._memory.popitem(last=False)
+ self._memory[key] = value
+
+ def clear(self):
+ self._active.clear()
+ self._memory.clear()
+
+
+class UnboundedMemo(dict):
+ """
+ A memoizing mapping that retains all deleted items
+ """
+ def __delitem__(self, key):
+ pass
+
+
def _escapeRegexRangeChars(s):
# escape these chars: ^-[]
for c in r"\^-[]":