diff options
author | ptmcg <ptmcg@austin.rr.com> | 2020-12-24 01:21:57 -0600 |
---|---|---|
committer | ptmcg <ptmcg@austin.rr.com> | 2020-12-24 01:21:57 -0600 |
commit | 2896fad83357bde252c0304aa4c0c55661486a5e (patch) | |
tree | 466fff0292422bb9c6717bf6bddb7963622ebdad | |
parent | c3be8acb355559cc1fdcb673f8df904b6947b0af (diff) | |
download | pyparsing-git-2896fad83357bde252c0304aa4c0c55661486a5e.tar.gz |
Deprecate `locatedExpr` in favor of new `Located` class
-rw-r--r-- | CHANGES | 11 | ||||
-rw-r--r-- | docs/whats_new_in_3_0_0.rst | 40 | ||||
-rw-r--r-- | pyparsing/core.py | 39 | ||||
-rw-r--r-- | pyparsing/helpers.py | 4 | ||||
-rw-r--r-- | tests/test_unit.py | 55 |
5 files changed, 141 insertions, 8 deletions
@@ -2,6 +2,17 @@ Change Log ========== +Version 3.0.0b2 +--------------- +- API CHANGE + `locatedExpr` is being replaced by the class `Located`. `Located` has the same + constructor interface as `locatedExpr`, but fixes bugs in the returned + `ParseResults` when the searched expression contains multiple tokens, or + has internal results names. + + `locatedExpr` is deprecated, and will be removed in a future release. + + Version 3.0.0b1 --------------- - API CHANGE diff --git a/docs/whats_new_in_3_0_0.rst b/docs/whats_new_in_3_0_0.rst index 4ddd051..24e9423 100644 --- a/docs/whats_new_in_3_0_0.rst +++ b/docs/whats_new_in_3_0_0.rst @@ -4,7 +4,7 @@ What's New in Pyparsing 3.0.0 :author: Paul McGuire -:date: November, 2020 +:date: December, 2020 :abstract: This document summarizes the changes made in the 3.0.0 release of pyparsing. @@ -89,6 +89,44 @@ just namespaces, to add some helpful behavior: mistake when using Forwards) (**currently not working on PyPy**) +New Located class to replace locatedExpr helper method +------------------------------------------------------ +The new ``Located`` class will replace the current ``locatedExpr`` method for +marking parsed results with the start and end locations of the parsed data in +the input string. ``locatedExpr`` had several bugs, and returned its results +in a hard-to-use format (location data and results names were mixed in with +the located expression's parsed results, and wrapped in an unnecessary extra +nesting level). + +For this code:: + + wd = Word(alphas) + for match in locatedExpr(wd).searchString("ljsdf123lksdjjf123lkkjj1222"): + print(match) + +the docs for ``locatedExpr`` show this output:: + + [[0, 'ljsdf', 5]] + [[8, 'lksdjjf', 15]] + [[18, 'lkkjj', 23]] + +The parsed values and the start and end locations are merged into a single +nested ParseResults (and any results names in the parsed values are also +merged in with the start and end location names). 
+ +Using ``Located``, the output is:: + + [0, ['ljsdf'], 5] + [8, ['lksdjjf'], 15] + [18, ['lkkjj'], 23] + +With ``Located``, the parsed expression values and results names are kept +separate in the second parsed value, and there is no extra grouping level +on the whole result. + +The existing ``locatedExpr`` is retained for backward-compatibility, but is +deprecated and will be removed in a future release. + New IndentedBlock class to replace indentedBlock helper method -------------------------------------------------------------- The new ``IndentedBlock`` class will replace the current ``indentedBlock`` method diff --git a/pyparsing/core.py b/pyparsing/core.py index 0649a3c..f79c86a 100644 --- a/pyparsing/core.py +++ b/pyparsing/core.py @@ -3903,6 +3903,45 @@ class PrecededBy(ParseElementEnhance): return loc, ret +class Located(ParseElementEnhance): + """ + Decorates a returned token with its starting and ending + locations in the input string. + + This helper adds the following results names: + + - ``locn_start`` - location where matched expression begins + - ``locn_end`` - location where matched expression ends + - ``value`` - the actual parsed results + + Be careful if the input text contains ``<TAB>`` characters, you + may want to call :class:`ParserElement.parseWithTabs` + + Example:: + + wd = Word(alphas) + for match in Located(wd).searchString("ljsdf123lksdjjf123lkkjj1222"): + print(match) + + prints:: + + [0, ['ljsdf'], 5] + [8, ['lksdjjf'], 15] + [18, ['lkkjj'], 23] + + """ + def parseImpl(self, instring, loc, doActions=True): + start = loc + loc, tokens = self.expr._parse( + instring, start, doActions, callPreParse=False + ) + ret_tokens = ParseResults([start, tokens, loc]) + ret_tokens['locn_start'] = start + ret_tokens['value'] = tokens + ret_tokens['locn_end'] = loc + return loc, ret_tokens + + class NotAny(ParseElementEnhance): """Lookahead to disallow matching with the given parse expression. 
``NotAny`` does *not* advance the parsing position within the diff --git a/pyparsing/helpers.py b/pyparsing/helpers.py index cf59107..a5bc2cc 100644 --- a/pyparsing/helpers.py +++ b/pyparsing/helpers.py @@ -337,7 +337,9 @@ def ungroup(expr): def locatedExpr(expr): - """Helper to decorate a returned token with its starting and ending + """ + (DEPRECATED - future code should use the Located class) + Helper to decorate a returned token with its starting and ending locations in the input string. This helper adds the following results names: diff --git a/tests/test_unit.py b/tests/test_unit.py index d84d08d..9b47f28 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -9,6 +9,7 @@ import contextlib import datetime +import re import sys from io import StringIO from textwrap import dedent @@ -56,6 +57,21 @@ class resetting: setattr(self.ob, attr, value) +def find_all_re_matches(patt, s): + ret = [] + start = 0 + if isinstance(patt, str): + patt = re.compile(patt) + while True: + found = patt.search(s, pos=start) + if found: + ret.append(found) + start = found.end() + else: + break + return ret + + class Test1_PyparsingTestInit(TestCase): def runTest(self): from pyparsing import ( @@ -1438,8 +1454,7 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): {"_skipped": ["red ", "456 "]}, ) - def testEllipsisRepetion(self): - import re + def testEllipsisRepetition(self): word = pp.Word(pp.alphas).setName("word") num = pp.Word(pp.nums).setName("num") @@ -2938,7 +2953,6 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): ) def testParseUsingRegex(self): - import re signedInt = pp.Regex(r"[-+][0-9]+") unsignedInt = pp.Regex(r"[0-9]+") @@ -3471,8 +3485,6 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): ) print() - import re - k = pp.Regex(r"a+", flags=re.S + re.M) k = k.parseWithTabs() k = k.leaveWhitespace() @@ -4607,6 +4619,38 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): "incorrect location 
calculation", ) + def testLocatedExprUsingLocated(self): + # 012345678901234567890123456789012345678901234567890 + samplestr1 = "DOB 10-10-2010;more garbage;ID PARI12345678 ;more garbage" + + id_ref = pp.Located("ID" + pp.Word(pp.alphanums, exact=12)("id")) + + res = id_ref.searchString(samplestr1)[0] + print(res.dump()) + self.assertEqual( + "ID PARI12345678", + samplestr1[res.locn_start:res.locn_end], + "incorrect location calculation", + ) + self.assertParseResultsEquals(res, + [28, ['ID', 'PARI12345678'], 43], + {'locn_end': 43, + 'locn_start': 28, + 'value': {'id': 'PARI12345678'}} + ) + + wd = pp.Word(pp.alphas) + test_string = "ljsdf123lksdjjf123lkkjj1222" + pp_matches = pp.Located(wd).searchString(test_string) + re_matches = find_all_re_matches("[a-z]+", test_string) + for pp_match, re_match in zip(pp_matches, re_matches): + self.assertParseResultsEquals(pp_match, [re_match.start(), + [re_match.group(0)], + re_match.end()]) + print(pp_match) + print(re_match) + print(pp_match.value) + def testPop(self): source = "AAA 123 456 789 234" patt = pp.Word(pp.alphas)("name") + pp.Word(pp.nums) * (1,) @@ -7142,7 +7186,6 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): def testWordInternalReRanges(self): import random - import re self.assertEqual( "[!-~]+", |