diff options
author | Max Fischer <maxfischer2781@gmail.com> | 2021-07-30 14:51:33 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-07-30 07:51:33 -0500 |
commit | 5eafa07470a3e761944ff5af00dbcd794dfa09da (patch) | |
tree | 832e780435bee81bbe5f1c619b8be544a6426656 /pyparsing | |
parent | d108a29db062c9250f50c978e3a86381d1b0746b (diff) | |
download | pyparsing-git-left_recursion_support.tar.gz |
Add support for LR parsingleft_recursion_support
* first draft of LR parsing
* removed debug output
* cache is owned and cleared by ParserElement
* bounded recursion must be enabled explicitly
* packrat rejects recursion
* basic LR unit test
* tests for associativity and nesting
* added math example
* fixed test typo
* unittest for empty and non-peg clauses
* LR-Forward can match Empty
* fixed test typos
* added base case to unittest
* memo cache only provides copies
* flattened Forward parse method
* added high-level description of algorithm
* expanded docstring
* added tests for repetition rules
* renamed bounded to left recursion
* naive test for existing suite
* explicitly testing tests for LR compatibility
* LR memo no longer mixes action/no-action results
* simplified replacement logic
* adjusted example with ambiguous failure case
* LR memo content is always returned as copy
* draft for peeking recursion
* memo update consistent for all actions
* fixed a bug for non-string token identifiers
* action wins against no-action
* cleanup
* properly setting names in tests
* memoization can be turned off
* testing memo switches
* typos
* flattened recursion memo
* left recursion memo size may be limited
* adjusted docs for recursion cache
Diffstat (limited to 'pyparsing')
-rw-r--r-- | pyparsing/core.py | 146 | ||||
-rw-r--r-- | pyparsing/results.py | 2 | ||||
-rw-r--r-- | pyparsing/testing.py | 3 | ||||
-rw-r--r-- | pyparsing/util.py | 46 |
4 files changed, 194 insertions, 3 deletions
diff --git a/pyparsing/core.py b/pyparsing/core.py index c4adb5d..d5ee05f 100644 --- a/pyparsing/core.py +++ b/pyparsing/core.py @@ -1,6 +1,7 @@ # # core.py # +from typing import Optional from abc import ABC, abstractmethod from enum import Enum import string @@ -23,6 +24,8 @@ from .util import ( _escapeRegexRangeChars, _bslash, _flatten, + LRUMemo as _LRUMemo, + UnboundedMemo as _UnboundedMemo, ) from .exceptions import * from .actions import * @@ -703,6 +706,10 @@ class ParserElement(ABC): else: return True + # cache for left-recursion in Forward references + recursion_lock = RLock() + recursion_memos = {} # type: dict[tuple[int, Forward, bool], tuple[int, ParseResults | Exception]] + # argument cache for optimizing repeated calls when backtracking through recursive expressions packrat_cache = ( {} @@ -766,11 +773,73 @@ class ParserElement(ABC): ParserElement.packrat_cache_stats[:] = [0] * len( ParserElement.packrat_cache_stats ) + ParserElement.recursion_memos.clear() _packratEnabled = False + _left_recursion_enabled = False + + @staticmethod + def disable_memoization(): + """ + Disables active Packrat or Left Recursion parsing and their memoization + + This method also works if neither Packrat nor Left Recursion are enabled. + This makes it safe to call before activating Packrat nor Left Recursion + to clear any previous settings. + """ + ParserElement.resetCache() + ParserElement._left_recursion_enabled = False + ParserElement._packratEnabled = False + ParserElement._parse = ParserElement._parseNoCache + + @staticmethod + def enable_left_recursion(cache_size_limit: Optional[int] = None, *, force=False): + """ + Enables "bounded recursion" parsing, which allows for both direct and indirect + left-recursion. During parsing, left-recursive :class:`Forward` elements are + repeatedly matched with a fixed recursion depth that is gradually increased + until finding the longest match. + + Example:: + + import pyparsing as pp + pp.ParserElement.enable_left_recursion() + + E = pp.Forward("E") + num = pp.Word(pp.nums) + # match `num`, or `num '+' num`, or `num '+' num '+' num`, ... + E <<= E + '+' - num | num + + print(E.parseString("1+2+3")) + + Recursion search naturally memoizes matches of ``Forward`` elements and may + thus skip reevaluation of parse actions during backtracking. This may break + programs with parse actions which rely on strict ordering of side-effects. + + Parameters: + + - cache_size_limit - (default=``None``) - memoize at most this many + ``Forward`` elements during matching; if ``None`` (the default), + memoize all ``Forward`` elements. + + Bounded Recursion parsing works similar but not identical to Packrat parsing, + thus the two cannot be used together. Use ``force=True`` to disable any + previous, conflicting settings. + """ + if force: + ParserElement.disable_memoization() + elif ParserElement._packratEnabled: + raise RuntimeError("Packrat and Bounded Recursion are not compatible") + if cache_size_limit is None: + ParserElement.recursion_memos = _UnboundedMemo() + elif cache_size_limit > 0: + ParserElement.recursion_memos = _LRUMemo(capacity=cache_size_limit) + else: + raise NotImplementedError("Memo size of %s" % cache_size_limit) + ParserElement._left_recursion_enabled = True @staticmethod - def enablePackrat(cache_size_limit=128): + def enablePackrat(cache_size_limit=128, *, force=False): """Enables "packrat" parsing, which adds memoizing to the parsing logic. Repeated parse attempts at the same string location (which happens often in many complex grammars) can immediately return a cached value, @@ -795,7 +864,15 @@ class ParserElement(ABC): import pyparsing pyparsing.ParserElement.enablePackrat() + + Packrat parsing works similar but not identical to Bounded Recursion parsing, + thus the two cannot be used together. Use ``force=True`` to disable any + previous, conflicting settings. """ + if force: + ParserElement.disable_memoization() + elif ParserElement._left_recursion_enabled: + raise RuntimeError("Packrat and Bounded Recursion are not compatible") if not ParserElement._packratEnabled: ParserElement._packratEnabled = True if cache_size_limit is None: @@ -4390,7 +4467,72 @@ class Forward(ParseElementEnhance): "Forward expression was never assigned a value, will not parse any input", stacklevel=stacklevel, ) - return super().parseImpl(instring, loc, doActions) + if not ParserElement._left_recursion_enabled: + return super().parseImpl(instring, loc, doActions) + # ## Bounded Recursion algorithm ## + # Recursion only needs to be processed at ``Forward`` elements, since they are + # the only ones that can actually refer to themselves. The general idea is + # to handle recursion stepwise: We start at no recursion, then recurse once, + # recurse twice, ..., until more recursion offers no benefit (we hit the bound). + # + # The "trick" here is that each ``Forward`` gets evaluated in two contexts + # - to *match* a specific recursion level, and + # - to *search* the bounded recursion level + # and the two run concurrently. The *search* must *match* each recursion level + # to find the best possible match. This is handled by a memo table, which + # provides the previous match to the next level match attempt. + # + # See also "Left Recursion in Parsing Expression Grammars", Medeiros et al. + # + # There is a complication since we not only *parse* but also *transform* via + # actions: We do not want to run the actions too often while expanding. Thus, + # we expand using `doActions=False` and only run `doActions=True` if the next + # recursion level is acceptable. + with ParserElement.recursion_lock: + memo = ParserElement.recursion_memos + try: + # we are parsing at a specific recursion expansion – use it as-is + prev_loc, prev_result = memo[loc, self, doActions] + if isinstance(prev_result, Exception): + raise prev_result + return prev_loc, prev_result.copy() + except KeyError: + act_key = (loc, self, True) + peek_key = (loc, self, False) + # we are searching for the best recursion expansion – keep on improving + # both `doActions` cases must be tracked separately here! + prev_loc, prev_peek = memo[peek_key] = loc - 1, ParseException( + instring, loc, "Forward recursion without base case", self + ) + if doActions: + memo[act_key] = memo[peek_key] + while True: + try: + new_loc, new_peek = super().parseImpl(instring, loc, False) + except ParseException: + # we failed before getting any match – do not hide the error + if isinstance(prev_peek, Exception): + raise + new_loc, new_peek = prev_loc, prev_peek + # the match did not get better: we are done + if new_loc <= prev_loc: + if doActions: + # replace the match for doActions=False as well, + # in case the action did backtrack + prev_loc, prev_result = memo[peek_key] = memo[act_key] + del memo[peek_key], memo[act_key] + return prev_loc, prev_result.copy() + del memo[peek_key] + return prev_loc, prev_peek.copy() + # the match did get better: see if we can improve further + else: + if doActions: + try: + memo[act_key] = super().parseImpl(instring, loc, True) + except ParseException as e: + memo[peek_key] = memo[act_key] = (new_loc, e) + raise + prev_loc, prev_peek = memo[peek_key] = new_loc, new_peek def leaveWhitespace(self, recursive=True): self.skipWhitespace = False diff --git a/pyparsing/results.py b/pyparsing/results.py index fb03ecd..fa94a00 100644 --- a/pyparsing/results.py +++ b/pyparsing/results.py @@ -522,7 +522,7 @@ class ParseResults: Returns a new copy of a :class:`ParseResults` object. """ ret = ParseResults(self._toklist) - ret._tokdict = dict(**self._tokdict) + ret._tokdict = self._tokdict.copy() ret._parent = self._parent ret._all_names |= self._all_names ret._name = self._name diff --git a/pyparsing/testing.py b/pyparsing/testing.py index 393f37b..43f23ab 100644 --- a/pyparsing/testing.py +++ b/pyparsing/testing.py @@ -20,6 +20,7 @@ class pyparsing_test: """ Context manager to be used when writing unit tests that modify pyparsing config values: - packrat parsing + - bounded recursion parsing - default whitespace characters. - default keyword characters - literal string auto-conversion class @@ -61,6 +62,7 @@ class pyparsing_test: else: self._save_context["packrat_cache_size"] = None self._save_context["packrat_parse"] = ParserElement._parse + self._save_context["recursion_enabled"] = ParserElement._left_recursion_enabled self._save_context["__diag__"] = { name: getattr(__diag__, name) for name in __diag__._all_names @@ -97,6 +99,7 @@ class pyparsing_test: ParserElement.enablePackrat(self._save_context["packrat_cache_size"]) else: ParserElement._parse = self._save_context["packrat_parse"] + ParserElement._left_recursion_enabled = self._save_context["recursion_enabled"] __compat__.collect_all_And_tokens = self._save_context["__compat__"] diff --git a/pyparsing/util.py b/pyparsing/util.py index 3cb69d2..875799d 100644 --- a/pyparsing/util.py +++ b/pyparsing/util.py @@ -119,6 +119,52 @@ class _FifoCache: self.clear = types.MethodType(clear, self) +class LRUMemo: + """ + A memoizing mapping that retains `capacity` deleted items + + The memo tracks retained items by their access order; once `capacity` items + are retained, the least recently used item is discarded. + """ + def __init__(self, capacity): + self._capacity = capacity + self._active = {} + self._memory = collections.OrderedDict() + + def __getitem__(self, key): + try: + return self._active[key] + except KeyError: + self._memory.move_to_end(key) + return self._memory[key] + + def __setitem__(self, key, value): + self._memory.pop(key, None) + self._active[key] = value + + def __delitem__(self, key): + try: + value = self._active.pop(key) + except KeyError: + pass + else: + while len(self._memory) >= self._capacity: + self._memory.popitem(last=False) + self._memory[key] = value + + def clear(self): + self._active.clear() + self._memory.clear() + + +class UnboundedMemo(dict): + """ + A memoizing mapping that retains all deleted items + """ + def __delitem__(self, key): + pass + + def _escapeRegexRangeChars(s): # escape these chars: ^-[] for c in r"\^-[]": |