diff options
author | ptmcg <ptmcg@austin.rr.com> | 2018-10-27 12:21:59 -0500 |
---|---|---|
committer | ptmcg <ptmcg@austin.rr.com> | 2018-10-27 12:21:59 -0500 |
commit | cd761d30e64c1ecaf3518087efe2f96df052c4fc (patch) | |
tree | fe2ab5e5b3942ab47af11825f680da5347663fcd /pyparsing.py | |
parent | ddd2ee7e9fcf5e5b7de6f1672d32ce8e5f94348d (diff) | |
download | pyparsing-git-cd761d30e64c1ecaf3518087efe2f96df052c4fc.tar.gz |
Add unicode character ranges by name
Diffstat (limited to 'pyparsing.py')
-rw-r--r-- | pyparsing.py | 542 |
1 files changed, 321 insertions, 221 deletions
diff --git a/pyparsing.py b/pyparsing.py
index 9279d34..dda791c 100644
--- a/pyparsing.py
+++ b/pyparsing.py
@@ -1,3 +1,4 @@
+#-*- coding: utf-8 -*-
# module pyparsing.py
#
# Copyright (c) 2003-2018 Paul T. McGuire
@@ -75,7 +76,7 @@ classes inherit from. Use the docstrings for examples of how to: """
__version__ = "2.3.0"
-__versionTime__ = "06 Oct 2018 14:31 UTC"
+__versionTime__ = "27 Oct 2018 16:36 UTC"
__author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"
import string
@@ -90,6 +91,7 @@ import pprint import traceback
import types
from datetime import datetime
+from itertools import filterfalse
try:
from _thread import RLock
@@ -121,7 +123,7 @@ __all__ = [ 'PrecededBy', 'MatchFirst', 'NoMatch', 'NotAny', 'OneOrMore', 'OnlyOnce', 'Optional', 'Or',
'ParseBaseException', 'ParseElementEnhance', 'ParseException', 'ParseExpression', 'ParseFatalException',
'ParseResults', 'ParseSyntaxException', 'ParserElement', 'QuotedString', 'RecursiveGrammarException',
-'Regex', 'SkipTo', 'StringEnd', 'StringStart', 'Suppress', 'Token', 'TokenConverter',
+'Regex', 'SkipTo', 'StringEnd', 'StringStart', 'Suppress', 'Token', 'TokenConverter',
'White', 'Word', 'WordEnd', 'WordStart', 'ZeroOrMore', 'Char',
'alphanums', 'alphas', 'alphas8bit', 'anyCloseTag', 'anyOpenTag', 'cStyleComment', 'col',
'commaSeparatedList', 'commonHTMLEntity', 'countedArray', 'cppStyleComment', 'dblQuotedString',
@@ -129,11 +131,11 @@ __all__ = [ 'htmlComment', 'javaStyleComment', 'line', 'lineEnd', 'lineStart', 'lineno',
'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral',
'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables',
-'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity',
+'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity',
'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd',
'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute',
'indentedBlock', 'originalTextFor', 'ungroup', 'infixNotation','locatedExpr', 'withClass',
-'CloseMatch', 'tokenMap', 'pyparsing_common',
+'CloseMatch', 'tokenMap', 'pyparsing_common', 'pyparsing_unicode',
]
system_version = tuple(sys.version_info)[:3]
@@ -179,9 +181,9 @@ else: singleArgBuiltins.append(getattr(__builtin__,fname))
except AttributeError:
continue
-
+
_generatorType = type((y for y in range(1)))
-
+
def _xml_escape(data):
"""Escape &, <, >, ", ', etc. in a string of data."""
@@ -220,7 +222,7 @@ class ParseBaseException(Exception): @classmethod
def _from_exception(cls, pe):
"""
- internal factory method to simplify creating one type of ParseException
+ internal factory method to simplify creating one type of ParseException
from another - avoids having __init__ signature conflicts among subclasses
"""
return cls(pe.pstr, pe.loc, pe.msg, pe.parserElement)
@@ -265,14 +267,14 @@ class ParseException(ParseBaseException): - lineno - returns the line number of the exception text
- col - returns the column number of the exception text
- line - returns the line containing the exception text
-
+
Example::
try:
Word(nums).setName("integer").parseString("ABC")
except ParseException as pe:
print(pe)
print("column: {}".format(pe.col))
-
+
prints::
Expected integer (at char 0), (line:1, col:1)
column: 1
@@ -286,7 +288,7 @@ class ParseFatalException(ParseBaseException): class ParseSyntaxException(ParseFatalException):
"""just like L{ParseFatalException}, but thrown internally when an
- L{ErrorStop<And._ErrorStop>} ('-' operator) indicates that parsing is to stop
+ L{ErrorStop<And._ErrorStop>} ('-' operator) indicates that parsing is to stop
immediately because an unbacktrackable syntax error has been found"""
pass
@@ -330,8 +332,8 @@ class ParseResults(object): Example::
integer = Word(nums)
- date_str = (integer.setResultsName("year") + '/'
- + integer.setResultsName("month") + '/'
+ date_str = (integer.setResultsName("year") + '/'
+ + integer.setResultsName("month") + '/'
+ integer.setResultsName("day"))
# equivalent form:
# date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
@@ -467,12 +469,12 @@ class ParseResults(object): def _itervalues( self ):
return (self[k] for k in self._iterkeys())
-
+
def _iteritems( self ):
return ((k, self[k]) for k in self._iterkeys())
if PY_3:
- keys = _iterkeys
+ keys = _iterkeys
"""Returns an iterator of all named result keys (Python 3.x only)."""
values = _itervalues
@@ -498,7 +500,7 @@ class ParseResults(object): def values( self ):
"""Returns all named result values (as a list in Python 2.x, as an iterator in Python 3.x)."""
return list(self.itervalues())
-
+
def items( self ):
"""Returns all named result key-values (as a list of tuples in Python 2.x, as an iterator in Python 3.x)."""
return list(self.iteritems())
@@ -507,16 +509,16 @@ class ParseResults(object): """Since keys() returns an iterator, this method is helpful in bypassing
code that looks for the existence of any defined results names."""
return bool(self.__tokdict)
-
+
def pop( self, *args, **kwargs):
"""
Removes and returns item at specified index (default=C{last}).
Supports both C{list} and C{dict} semantics for C{pop()}. If passed no
argument or an integer argument, it will use C{list} semantics
- and pop tokens from the list of parsed tokens. If passed a
+ and pop tokens from the list of parsed tokens. If passed a
non-integer argument (most likely a string), it will use C{dict}
- semantics and pop the corresponding value from any defined
- results names. A second default return value argument is
+ semantics and pop the corresponding value from any defined
+ results names. A second default return value argument is
supported, just as in C{dict.pop()}.
Example::
@@ -549,8 +551,8 @@ class ParseResults(object): args = (args[0], v)
else:
raise TypeError("pop() got an unexpected keyword argument '%s'" % k)
- if (isinstance(args[0], int) or
- len(args) == 1 or
+ if (isinstance(args[0], int) or
+ len(args) == 1 or
args[0] in self):
index = args[0]
ret = self[index]
@@ -567,10 +569,10 @@ class ParseResults(object): C{defaultValue} is specified.
Similar to C{dict.get()}.
-
+
Example::
integer = Word(nums)
- date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
+ date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
result = date_str.parseString("1999/12/31")
print(result.get("year")) # -> '1999'
@@ -585,7 +587,7 @@ class ParseResults(object): def insert( self, index, insStr ):
"""
Inserts new element at location index in the list of parsed tokens.
-
+
Similar to C{list.insert()}.
Example::
@@ -608,7 +610,7 @@ class ParseResults(object): Example::
print(OneOrMore(Word(nums)).parseString("0 123 321")) # -> ['0', '123', '321']
-
+
# use a parse action to compute the sum of the parsed integers, and add it to the end
def append_sum(tokens):
tokens.append(sum(map(int, tokens)))
@@ -622,7 +624,7 @@ class ParseResults(object): Example::
patt = OneOrMore(Word(alphas))
-
+
# use a parse action to append the reverse of the matched strings, to make a palindrome
def make_palindrome(tokens):
tokens.extend(reversed([t[::-1] for t in tokens]))
@@ -646,7 +648,7 @@ class ParseResults(object): return self[name]
except KeyError:
return ""
-
+
if name in self.__tokdict:
if name not in self.__accumNames:
return self.__tokdict[name][-1][0]
@@ -671,7 +673,7 @@ class ParseResults(object): self[k] = v
if isinstance(v[0],ParseResults):
v[0].__parent = wkref(self)
-
+
self.__toklist += other.__toklist
self.__accumNames.update( other.__accumNames )
return self
@@ -683,7 +685,7 @@ class ParseResults(object): else:
# this may raise a TypeError - so be it
return other + self
-
+
def __repr__( self ):
return "(%s, %s)" % ( repr( self.__toklist ), repr( self.__tokdict ) )
@@ -710,7 +712,7 @@ class ParseResults(object): result = patt.parseString("sldkj lsdkj sldkj")
# even though the result prints in string-like form, it is actually a pyparsing ParseResults
print(type(result), result) # -> <class 'pyparsing.ParseResults'> ['sldkj', 'lsdkj', 'sldkj']
-
+
# Use asList() to create an actual list
result_list = result.asList()
print(type(result_list), result_list) # -> <class 'list'> ['sldkj', 'lsdkj', 'sldkj']
@@ -724,10 +726,10 @@ class ParseResults(object): Example::
integer = Word(nums)
date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
-
+
result = date_str.parseString('12/31/1999')
print(type(result), repr(result)) # -> <class 'pyparsing.ParseResults'> (['12', '/', '31', '/', '1999'], {'day': [('1999', 4)], 'year': [('12', 0)], 'month': [('31', 2)]})
-
+
result_dict = result.asDict()
print(type(result_dict), repr(result_dict)) # -> <class 'dict'> {'day': '1999', 'year': '12', 'month': '31'}
@@ -740,7 +742,7 @@ class ParseResults(object): item_fn = self.items
else:
item_fn = self.iteritems
-
+
def toItem(obj):
if isinstance(obj, ParseResults):
if obj.haskeys():
@@ -749,7 +751,7 @@ class ParseResults(object): return [toItem(v) for v in obj]
else:
return obj
-
+
return dict((k,toItem(v)) for k,v in item_fn())
def copy( self ):
@@ -757,7 +759,7 @@ class ParseResults(object): Returns a new copy of a C{ParseResults} object.
"""
ret = ParseResults( self.__toklist )
- ret.__tokdict = self.__tokdict.copy()
+ ret.__tokdict = dict(self.__tokdict.items())
ret.__parent = self.__parent
ret.__accumNames.update( self.__accumNames )
ret.__name = self.__name
@@ -833,18 +835,18 @@ class ParseResults(object): def getName(self):
r"""
- Returns the results name for this token expression. Useful when several
+ Returns the results name for this token expression. Useful when several
different expressions might match at a particular location.
Example::
integer = Word(nums)
ssn_expr = Regex(r"\d\d\d-\d\d-\d\d\d\d")
house_number_expr = Suppress('#') + Word(nums, alphanums)
- user_data = (Group(house_number_expr)("house_number")
+ user_data = (Group(house_number_expr)("house_number")
| Group(ssn_expr)("ssn")
| Group(integer)("age"))
user_info = OneOrMore(user_data)
-
+
result = user_info.parseString("22 111-22-3333 #221B")
for item in result:
print(item.getName(), ':', item[0])
@@ -877,7 +879,7 @@ class ParseResults(object): Example::
integer = Word(nums)
date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
-
+
result = date_str.parseString('12/31/1999')
print(result.dump())
prints::
@@ -910,13 +912,13 @@ class ParseResults(object): out.append("\n%s%s[%d]:\n%s%s%s" % (indent,(' '*(depth)),i,indent,(' '*(depth+1)),vv.dump(indent,depth+1) ))
else:
out.append("\n%s%s[%d]:\n%s%s%s" % (indent,(' '*(depth)),i,indent,(' '*(depth+1)),_ustr(vv)))
-
+
return "".join(out)
def pprint(self, *args, **kwargs):
"""
Pretty-printer for parsed results as a list, using the C{pprint} module.
- Accepts additional positional or keyword args as defined for the
+ Accepts additional positional or keyword args as defined for the
C{pprint.pprint} method. (U{http://docs.python.org/3/library/pprint.html#pprint.pprint})
Example::
@@ -1041,7 +1043,7 @@ def _trim_arity(func, maxargs=2): return lambda s,l,t: func(t)
limit = [0]
foundArity = [False]
-
+
# traceback return data structure changed in Py3.5 - normalize back to plain tuples
if system_version[:2] >= (3,5):
def extract_stack(limit=0):
@@ -1056,12 +1058,12 @@ def _trim_arity(func, maxargs=2): else:
extract_stack = traceback.extract_stack
extract_tb = traceback.extract_tb
-
- # synthesize what would be returned by traceback.extract_stack at the call to
+
+ # synthesize what would be returned by traceback.extract_stack at the call to
# user's parse action 'func', so that we don't incur call penalty at parse time
-
+
LINE_DIFF = 6
- # IF ANY CODE CHANGES, EVEN JUST COMMENTS OR BLANK LINES, BETWEEN THE NEXT LINE AND
+ # IF ANY CODE CHANGES, EVEN JUST COMMENTS OR BLANK LINES, BETWEEN THE NEXT LINE AND
# THE CALL TO FUNC INSIDE WRAPPER, LINE_DIFF MUST BE MODIFIED!!!!
this_line = extract_stack(limit=2)[-1]
pa_call_line_synth = (this_line[0], this_line[1]+LINE_DIFF)
@@ -1092,7 +1094,7 @@ def _trim_arity(func, maxargs=2): # copy func name to wrapper for sensible debug output
func_name = "<parse action>"
try:
- func_name = getattr(func, '__name__',
+ func_name = getattr(func, '__name__',
getattr(func, '__class__').__name__)
except Exception:
func_name = str(func)
@@ -1113,7 +1115,7 @@ class ParserElement(object): Example::
# default whitespace chars are space, <TAB> and newline
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def', 'ghi', 'jkl']
-
+
# change to just treat newline as significant
ParserElement.setDefaultWhitespaceChars(" \t")
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def']
@@ -1124,18 +1126,18 @@ class ParserElement(object): def inlineLiteralsUsing(cls):
"""
Set class to be used for inclusion of string literals into a parser.
-
+
Example::
# default literal class used is Literal
integer = Word(nums)
- date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
+ date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
date_str.parseString("1999/12/31") # -> ['1999', '/', '12', '/', '31']
# change to Suppress
ParserElement.inlineLiteralsUsing(Suppress)
- date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
+ date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
date_str.parseString("1999/12/31") # -> ['1999', '12', '31']
"""
@@ -1168,12 +1170,12 @@ class ParserElement(object): """
Make a copy of this C{ParserElement}. Useful for defining different parse actions
for the same parsing pattern, using copies of the original parse element.
-
+
Example::
integer = Word(nums).setParseAction(lambda toks: int(toks[0]))
integerK = integer.copy().addParseAction(lambda toks: toks[0]*1024) + Suppress("K")
integerM = integer.copy().addParseAction(lambda toks: toks[0]*1024*1024) + Suppress("M")
-
+
print(OneOrMore(integerK | integerM | integer).parseString("5K 100 640K 256M"))
prints::
[5120, 100, 655360, 268435456]
@@ -1190,7 +1192,7 @@ class ParserElement(object): def setName( self, name ):
"""
Define name for this expression, makes debugging and exception messages clearer.
-
+
Example::
Word(nums).parseString("ABC") # -> Exception: Expected W:(0123...) (at char 0), (line:1, col:1)
Word(nums).setName("integer").parseString("ABC") # -> Exception: Expected integer (at char 0), (line:1, col:1)
@@ -1210,12 +1212,12 @@ class ParserElement(object): integer, and reference it in multiple places with different names.
You can also set results names using the abbreviated syntax,
- C{expr("name")} in place of C{expr.setResultsName("name")} -
+ C{expr("name")} in place of C{expr.setResultsName("name")} -
see L{I{__call__}<__call__>}.
Example::
- date_str = (integer.setResultsName("year") + '/'
- + integer.setResultsName("month") + '/'
+ date_str = (integer.setResultsName("year") + '/'
+ + integer.setResultsName("month") + '/'
+ integer.setResultsName("day"))
# equivalent form:
@@ -1267,7 +1269,7 @@ class ParserElement(object): on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
consistent view of the parsed string, the parse location, and line and column
positions within the parsed string.
-
+
Example::
integer = Word(nums)
date_str = integer + '/' + integer + '/' + integer
@@ -1288,7 +1290,7 @@ class ParserElement(object): def addParseAction( self, *fns, **kwargs ):
"""
Add one or more parse actions to expression's list of parse actions. See L{I{setParseAction}<setParseAction>}.
-
+
See examples in L{I{copy}<copy>}.
"""
self.parseAction += list(map(_trim_arity, list(fns)))
@@ -1296,14 +1298,14 @@ class ParserElement(object): return self
def addCondition(self, *fns, **kwargs):
- """Add a boolean predicate function to expression's list of parse actions. See
- L{I{setParseAction}<setParseAction>} for function call signatures. Unlike C{setParseAction},
+ """Add a boolean predicate function to expression's list of parse actions. See
+ L{I{setParseAction}<setParseAction>} for function call signatures. Unlike C{setParseAction},
functions passed to C{addCondition} need to return boolean success/fail of the condition.
Optional keyword arguments:
- message = define a custom message to be used in the raised exception
- fatal = if True, will raise ParseFatalException to stop parsing immediately; otherwise will raise ParseException
-
+
Example::
integer = Word(nums).setParseAction(lambda toks: int(toks[0]))
year_int = integer.copy()
@@ -1455,7 +1457,7 @@ class ParserElement(object): return self._parse( instring, loc, doActions=False )[0]
except ParseFatalException:
raise ParseException( instring, loc, self.errmsg, self)
-
+
def canParseNext(self, instring, loc):
try:
self.tryParse(instring, loc)
@@ -1477,7 +1479,7 @@ class ParserElement(object): def clear(self):
cache.clear()
-
+
def cache_len(self):
return len(cache)
@@ -1589,13 +1591,13 @@ class ParserElement(object): often in many complex grammars) can immediately return a cached value,
instead of re-executing parsing/validating code. Memoizing is done of
both valid results and parsing exceptions.
-
+
Parameters:
- cache_size_limit - (default=C{128}) - if an integer value is provided
will limit the size of the packrat cache; if None is passed, then
the cache size will be unbounded; if 0 is passed, the cache will
be effectively disabled.
-
+
This speedup may break existing programs that use parse actions that
have side-effects. For this reason, packrat parsing is disabled when
you first import pyparsing. To activate the packrat feature, your
@@ -1604,7 +1606,7 @@ class ParserElement(object): C{enablePackrat} before calling C{psyco.full()}. If you do not do this,
Python will crash. For best results, call C{enablePackrat()} immediately
after importing pyparsing.
-
+
Example::
import pyparsing
pyparsing.ParserElement.enablePackrat()
@@ -1639,7 +1641,7 @@ class ParserElement(object): reference the input string using the parse action's C{s} argument
- explicitly expand the tabs in your input string before calling
C{parseString}
-
+
Example::
Word('a').parseString('aaaaabaaa') # -> ['aaaaa']
Word('a').parseString('aaaaabaaa', parseAll=True) # -> Exception: Expected end of text
@@ -1684,9 +1686,9 @@ class ParserElement(object): for tokens,start,end in Word(alphas).scanString(source):
print(' '*start + '^'*(end-start))
print(' '*start + tokens[0])
-
+
prints::
-
+
sldjf123lsdjjkf345sldkjf879lkjsfd987
^^^^^
sldjf
@@ -1746,11 +1748,11 @@ class ParserElement(object): Invoking C{transformString()} on a target string will then scan for matches,
and replace the matched text patterns according to the logic in the parse
action. C{transformString()} returns the resulting transformed string.
-
+
Example::
wd = Word(alphas)
wd.setParseAction(lambda toks: toks[0].title())
-
+
print(wd.transformString("now is the winter of our discontent made glorious summer by this sun of york."))
Prints::
Now Is The Winter Of Our Discontent Made Glorious Summer By This Sun Of York.
@@ -1786,11 +1788,11 @@ class ParserElement(object): Another extension to C{L{scanString}}, simplifying the access to the tokens found
to match the given parse expression. May be called with optional
C{maxMatches} argument, to clip searching after 'n' matches are found.
-
+
Example::
# a capitalized word starts with an uppercase letter, followed by zero or more lowercase letters
cap_word = Word(alphas.upper(), alphas.lower())
-
+
print(cap_word.searchString("More than Iron, more than Lead, more than Gold I need Electricity"))
# the sum() builtin can be used to merge results into a single ParseResults object
@@ -1814,8 +1816,8 @@ class ParserElement(object): May be called with optional C{maxsplit} argument, to limit the number of splits;
and the optional C{includeSeparators} argument (default=C{False}), if the separating
matching text should be included in the split results.
-
- Example::
+
+ Example::
punc = oneOf(list(".,;:/-!?"))
print(list(punc.split("This, this?, this sentence, is badly punctuated!")))
prints::
@@ -1834,7 +1836,7 @@ class ParserElement(object): """
Implementation of + operator - returns C{L{And}}. Adding strings to a ParserElement
converts them to L{Literal}s by default.
-
+
Example::
greet = Word(alphas) + "," + Word(alphas) + "!"
hello = "Hello, World!"
@@ -2038,16 +2040,16 @@ class ParserElement(object): def __call__(self, name=None):
"""
Shortcut for C{L{setResultsName}}, with C{listAllMatches=False}.
-
+
If C{name} is given with a trailing C{'*'} character, then C{listAllMatches} will be
passed as C{True}.
-
+
If C{name} is omitted, same as calling C{L{copy}}.
Example::
# these are equivalent
userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno")
- userdata = Word(alphas)("name") + Word(nums+"-")("socsecno")
+ userdata = Word(alphas)("name") + Word(nums+"-")("socsecno")
"""
if name is not None:
return self.setResultsName(name)
@@ -2093,11 +2095,11 @@ class ParserElement(object): Define expression to be ignored (e.g., comments) while doing pattern
matching; may be called repeatedly, to define multiple comment or other
ignorable patterns.
-
+
Example::
patt = OneOrMore(Word(alphas))
patt.parseString('ablaj /* comment */ lskjd') # -> ['ablaj']
-
+
patt.ignore(cStyleComment)
patt.parseString('ablaj /* comment */ lskjd') # -> ['ablaj', 'lskjd']
"""
@@ -2130,12 +2132,12 @@ class ParserElement(object): wd = Word(alphas).setName("alphaword")
integer = Word(nums).setName("numword")
term = wd | integer
-
+
# turn on debugging for wd
wd.setDebug()
OneOrMore(term).parseString("abc 123 xyz 890")
-
+
prints::
Match alphaword at loc 0(1,1)
Matched alphaword -> ['abc']
@@ -2224,13 +2226,13 @@ class ParserElement(object): def matches(self, testString, parseAll=True):
"""
- Method for quick testing of a parser against a test string. Good for simple
+ Method for quick testing of a parser against a test string. Good for simple
inline microtests of sub expressions while building up a larger parser.
-
+
Parameters:
- testString - to test against this expression for a match
- parseAll - (default=C{True}) - flag to pass to C{L{parseString}} when running tests
-
+
Example::
expr = Word(nums)
assert expr.matches("100")
@@ -2240,17 +2242,17 @@ class ParserElement(object): return True
except ParseBaseException:
return False
-
+
def runTests(self, tests, parseAll=True, comment='#', fullDump=True, printResults=True, failureTests=False):
"""
Execute the parse expression on a series of test strings, showing each
test, the parsed results or where the parse failed. Quick and easy way to
run a parse expression against a list of sample strings.
-
+
Parameters:
- tests - a list of separate test strings, or a multiline string of test strings
- - parseAll - (default=C{True}) - flag to pass to C{L{parseString}} when running tests
- - comment - (default=C{'#'}) - expression for indicating embedded comments in the test
+ - parseAll - (default=C{True}) - flag to pass to C{L{parseString}} when running tests
+ - comment - (default=C{'#'}) - expression for indicating embedded comments in the test
string; pass None to disable comment filtering
- fullDump - (default=C{True}) - dump results as list followed by results names in nested outline;
if False, only dump nested list
@@ -2258,9 +2260,9 @@ class ParserElement(object): - failureTests - (default=C{False}) indicates if these tests are expected to fail parsing
Returns: a (success, results) tuple, where success indicates that all tests succeeded
- (or failed if C{failureTests} is True), and the results contain a list of lines of each
+ (or failed if C{failureTests} is True), and the results contain a list of lines of each
test's output
-
+
Example::
number_expr = pyparsing_common.number.copy()
@@ -2303,7 +2305,7 @@ class ParserElement(object): [1e-12]
Success
-
+
# stray character
100Z
^
@@ -2325,7 +2327,7 @@ class ParserElement(object): lines, create a test like this::
expr.runTests(r"this is a test\\n of strings that spans \\n 3 lines")
-
+
(Note that this is a raw string literal, you must include the leading 'r'.)
"""
if isinstance(tests, basestring):
@@ -2369,10 +2371,10 @@ class ParserElement(object): print('\n'.join(out))
allResults.append((t, result))
-
+
return success, allResults
-
+
class Token(ParserElement):
"""
Abstract C{ParserElement} subclass, for defining atomic matching patterns.
@@ -2410,14 +2412,14 @@ class NoMatch(Token): class Literal(Token):
"""
Token to exactly match a specified string.
-
+
Example::
Literal('blah').parseString('blah') # -> ['blah']
Literal('blah').parseString('blahfooblah') # -> ['blah']
Literal('blah').parseString('bla') # -> Exception: Expected "blah"
-
+
For case-insensitive matching, use L{CaselessLiteral}.
-
+
For keyword matching (force word break before and after the matched string),
use L{Keyword} or L{CaselessKeyword}.
"""
@@ -2458,7 +2460,7 @@ class Keyword(Token): - C{identChars} is a string of characters that would be valid identifier characters,
defaulting to all alphanumerics + "_" and "$"
- C{caseless} allows case-insensitive matching, default is C{False}.
-
+
Example::
Keyword("start").parseString("start") # -> ['start']
Keyword("start").parseString("starting") # -> Exception
@@ -2521,7 +2523,7 @@ class CaselessLiteral(Literal): Example::
OneOrMore(CaselessLiteral("CMD")).parseString("cmd CMD Cmd10") # -> ['CMD', 'CMD', 'CMD']
-
+
(Contrast with example for L{CaselessKeyword}.)
"""
def __init__( self, matchString ):
@@ -2542,7 +2544,7 @@ class CaselessKeyword(Keyword): Example::
OneOrMore(CaselessKeyword("CMD")).parseString("cmd CMD Cmd10") # -> ['CMD', 'CMD']
-
+
(Contrast with example for L{CaselessLiteral}.)
"""
def __init__( self, matchString, identChars=None ):
@@ -2556,17 +2558,17 @@ class CaselessKeyword(Keyword): class CloseMatch(Token):
"""
- A variation on L{Literal} which matches "close" matches, that is,
+ A variation on L{Literal} which matches "close" matches, that is,
strings with at most 'n' mismatching characters. C{CloseMatch} takes parameters:
- C{match_string} - string to be matched
- C{maxMismatches} - (C{default=1}) maximum number of mismatches allowed to count as a match
-
+
The results from a successful parse will contain the matched text from the input string and the following named results:
- C{mismatches} - a list of the positions within the match_string where mismatches were found
- C{original} - the original match_string used to compare against the input string
-
+
If C{mismatches} is an empty list, then the match was an exact match.
-
+
Example::
patt = CloseMatch("ATCATCGAATGGA")
patt.parseString("ATCATCGAAXGGA") # -> (['ATCATCGAAXGGA'], {'mismatches': [[9]], 'original': ['ATCATCGAATGGA']})
@@ -2624,14 +2626,14 @@ class Word(Token): maximum, and/or exact length. The default value for C{min} is 1 (a
minimum value < 1 is not valid); the default values for C{max} and C{exact}
are 0, meaning no maximum or exact length restriction. An optional
- C{excludeChars} parameter can list characters that might be found in
+ C{excludeChars} parameter can list characters that might be found in
the input C{bodyChars} string; useful to define a word of all printables
except for one or two characters, for instance.
-
- L{srange} is useful for defining custom character set strings for defining
+
+ L{srange} is useful for defining custom character set strings for defining
C{Word} expressions, using range notation from regular expression character sets.
-
- A common mistake is to use C{Word} to match a specific literal string, as in
+
+ A common mistake is to use C{Word} to match a specific literal string, as in
C{Word("Address")}. Remember that C{Word} uses the string argument to define
I{sets} of matchable characters. This expression would match "Add", "AAA",
"dAred", or any other word made up of the characters 'A', 'd', 'r', 'e', and 's'.
@@ -2649,16 +2651,16 @@ class Word(Token): Example::
# a word composed of digits
integer = Word(nums) # equivalent to Word("0123456789") or Word(srange("0-9"))
-
+
# a word with a leading capital, and zero or more lowercase
capital_word = Word(alphas.upper(), alphas.lower())
# hostnames are alphanumeric, with leading alpha, and '-'
hostname = Word(alphas, alphanums+'-')
-
+
# roman numeral (not a strict parser, accepts invalid mix of characters)
roman = Word("IVXLCDM")
-
+
# any string of non-whitespace characters, except for ','
csv_value = Word(printables, excludeChars=",")
"""
@@ -2776,7 +2778,7 @@ class Word(Token): class Char(Word):
"""
- A short-cut class for defining C{Word(characters, exact=1)},
+ A short-cut class for defining C{Word(characters, exact=1)},
when defining a match of any single character in a string of characters.
"""
def __init__(self, charset):
@@ -2789,7 +2791,7 @@ class Regex(Token): r"""
Token for matching strings that match a given regular expression.
Defined with string specifying the regular expression in a form recognized by the inbuilt Python re module.
- If the given regex contains named groups (defined using C{(?P<name>...)}), these will be preserved as
+ If the given regex contains named groups (defined using C{(?P<name>...)}), these will be preserved as
named parse results.
Example::
@@ -2797,7 +2799,7 @@ class Regex(Token): date = Regex(r'(?P<year>\d{4})-(?P<month>\d\d?)-(?P<day>\d\d?)')
# ref: http://stackoverflow.com/questions/267399/how-do-you-match-only-valid-roman-numerals-with-a-regular-expression
roman = Regex(r"M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})")
-
+
make_html = Regex(r"(\w+):(.*?):").sub(r"<\1>\2</\1>")
print(make_html.transformString("h1:main title:"))
# prints "<h1>main title</h1>"
@@ -2828,7 +2830,7 @@ class Regex(Token): self.pattern = \
self.reString = str(pattern)
self.flags = flags
-
+
else:
raise ValueError("Regex may only be constructed with a string or a compiled RE object")
@@ -2874,14 +2876,14 @@ class Regex(Token): result as if called using C{re.sub(expr, repl, string)}.
"""
if self.asGroupList:
- warnings.warn("cannot use sub() with Regex(asGroupList=True)",
+ warnings.warn("cannot use sub() with Regex(asGroupList=True)",
SyntaxWarning, stacklevel=2)
raise SyntaxError()
if self.asMatch and callable(repl):
- warnings.warn("cannot use sub() with a callable with Regex(asMatch=True)",
+ warnings.warn("cannot use sub() with a callable with Regex(asMatch=True)",
SyntaxWarning, stacklevel=2)
- raise SyntaxError()
+ raise SyntaxError()
if self.asMatch:
def pa(tokens):
@@ -2894,7 +2896,7 @@ class Regex(Token): class QuotedString(Token):
r"""
Token for matching strings that are delimited by quoting characters.
-
+
Defined with the following parameters:
- quoteChar - string of one or more characters defining the quote delimiting string
- escChar - character to escape quotes, typically backslash (default=C{None})
@@ -3186,9 +3188,9 @@ class GoToColumn(_PositionToken): class LineStart(_PositionToken):
"""
Matches if current position is at the beginning of a line within the parse string
-
+
Example::
-
+
test = '''\
AAA this line
AAA and this line
@@ -3198,10 +3200,10 @@ class LineStart(_PositionToken): for t in (LineStart() + 'AAA' + restOfLine).searchString(test):
print(t)
-
+
Prints::
['AAA', ' this line']
- ['AAA', ' and this line']
+ ['AAA', ' and this line']
"""
def __init__( self ):
@@ -3403,7 +3405,7 @@ class ParseExpression(ParserElement): self.mayIndexError |= other.mayIndexError
self.errmsg = "Expected " + _ustr(self)
-
+
return self
def setResultsName( self, name, listAllMatches=False ):
@@ -3415,7 +3417,7 @@ class ParseExpression(ParserElement): for e in self.exprs:
e.validate(tmp)
self.checkRecursion( [] )
-
+
def copy(self):
ret = super(ParseExpression,self).copy()
ret.exprs = [e.copy() for e in self.exprs]
@@ -3505,7 +3507,7 @@ class Or(ParseExpression): Example::
# construct Or using '^' operator
-
+
number = Word(nums) ^ Combine(Word(nums) + '.' + Word(nums))
print(number.searchString("123 3.1416 789"))
prints::
@@ -3584,7 +3586,7 @@ class MatchFirst(ParseExpression): Example::
# construct MatchFirst using '|' operator
-
+
# watch the order of expressions to match
number = Word(nums) | Combine(Word(nums) + '.' + Word(nums))
print(number.searchString("123 3.1416 789")) # Fail! -> [['123'], ['3'], ['1416'], ['789']]
@@ -3659,7 +3661,7 @@ class Each(ParseExpression): color_attr = "color:" + color("color")
size_attr = "size:" + integer("size")
- # use Each (using operator '&') to accept attributes in any order
+ # use Each (using operator '&') to accept attributes in any order
# (shape and posn are required, color and size are optional)
shape_spec = shape_attr & posn_attr & Optional(color_attr) & Optional(size_attr)
@@ -3850,7 +3852,7 @@ class FollowedBy(ParseElementEnhance): Lookahead matching of the given parse expression. C{FollowedBy}
does I{not} advance the parsing position within the input string, it only
verifies that the specified parse expression matches at the current
- position. C{FollowedBy} always returns a null token list. If any
+ position. C{FollowedBy} always returns a null token list. If any
results names are defined in the lookahead expression, those *will* be
returned for access by name.
@@ -3859,7 +3861,7 @@ class FollowedBy(ParseElementEnhance): data_word = Word(alphas)
label = data_word + FollowedBy(':')
attr_expr = Group(label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join))
-
+
OneOrMore(attr_expr).parseString("shape: SQUARE color: BLACK posn: upper left").pprint()
prints::
[['shape', 'SQUARE'], ['color', 'BLACK'], ['posn', 'upper left']]
@@ -3881,24 +3883,24 @@ class PrecededBy(ParseElementEnhance): verifies that the specified parse expression matches prior to the current
position. C{PrecededBy} always returns a null token list, but if
a results name is defined on the given expression, it is returned.
-
+
Parameters:
- expr - expression that must match prior to the current parse location
- - retreat - (default=C{None}) - (int) maximum number of characters to
+ - retreat - (default=C{None}) - (int) maximum number of characters to
lookbehind prior to the current parse location
-
+
If the lookbehind expression is a string, Literal, Keyword, or a
Word or CharsNotIn with a specified exact or maximum length, then
the retreat parameter is not required. Otherwise, retreat must be
specified to give a maximum number of characters to look back from
the current parse position for a lookbehind match.
-
+
Example::
# VB-style variable names with type prefixes
int_var = PrecededBy("#") + pyparsing_common.identifier
str_var = PrecededBy("$") + pyparsing_common.identifier
-
+
"""
def __init__(self, expr, retreat=None):
super(PrecededBy, self).__init__(expr)
@@ -3962,10 +3964,10 @@ class NotAny(ParseElementEnhance): ident = ~(AND | OR | NOT) + Word(alphas)
boolean_term = Optional(NOT) + ident
- # very crude boolean expression - to support parenthesis groups and
+ # very crude boolean expression - to support parenthesis groups and
# operation hierarchy, use infixNotation
boolean_expr = boolean_term + ZeroOrMore((AND | OR) + boolean_term)
-
+
# integers that are followed by "." are actually floats
integer = Word(nums) + ~Char(".")
"""
@@ -4005,7 +4007,7 @@ class _MultipleMatch(ParseElementEnhance): check_ender = self.not_ender is not None
if check_ender:
try_not_ender = self.not_ender.tryParse
-
+
# must be at least one (but first see if we are the stopOn sentinel;
# if so, fail)
if check_ender:
@@ -4027,16 +4029,16 @@ class _MultipleMatch(ParseElementEnhance): pass
return loc, tokens
-
+
class OneOrMore(_MultipleMatch):
"""
Repetition of one or more of the given expression.
-
+
Parameters:
- expr - expression that must match one or more times
- stopOn - (default=C{None}) - expression for a terminating sentinel
- (only required if the sentinel would ordinarily match the repetition
- expression)
+ (only required if the sentinel would ordinarily match the repetition
+ expression)
Example::
data_word = Word(alphas)
@@ -4049,7 +4051,7 @@ class OneOrMore(_MultipleMatch): # use stopOn attribute for OneOrMore to avoid reading label string as part of the data
attr_expr = Group(label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join))
OneOrMore(attr_expr).parseString(text).pprint() # Better -> [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'BLACK']]
-
+
# could also be written as
(attr_expr * (1,)).parseString(text).pprint()
"""
@@ -4066,19 +4068,19 @@ class OneOrMore(_MultipleMatch): class ZeroOrMore(_MultipleMatch):
"""
Optional repetition of zero or more of the given expression.
-
+
Parameters:
- expr - expression that must match zero or more times
- stopOn - (default=C{None}) - expression for a terminating sentinel
- (only required if the sentinel would ordinarily match the repetition
- expression)
+ (only required if the sentinel would ordinarily match the repetition
+ expression)
Example: similar to L{OneOrMore}
"""
def __init__( self, expr, stopOn=None):
super(ZeroOrMore,self).__init__(expr, stopOn=stopOn)
self.mayReturnEmpty = True
-
+
def parseImpl( self, instring, loc, doActions=True ):
try:
return super(ZeroOrMore, self).parseImpl(instring, loc, doActions)
@@ -4116,10 +4118,10 @@ class Optional(ParseElementEnhance): zip.runTests('''
# traditional ZIP code
12345
-
+
# ZIP+4 form
12101-0001
-
+
# invalid ZIP
98765-
''')
@@ -4172,12 +4174,12 @@ class SkipTo(ParseElementEnhance): Parameters:
- expr - target expression marking the end of the data to be skipped
- - include - (default=C{False}) if True, the target expression is also parsed
+ - include - (default=C{False}) if True, the target expression is also parsed
(the skipped text and target expression are returned as a 2-element list).
- - ignore - (default=C{None}) used to define grammars (typically quoted strings and
+ - ignore - (default=C{None}) used to define grammars (typically quoted strings and
comments) that might contain false matches to the target expression
- - failOn - (default=C{None}) define expressions that are not allowed to be
- included in the skipped test; if found before the target expression is found,
+ - failOn - (default=C{None}) define expressions that are not allowed to be
+ included in the skipped test; if found before the target expression is found,
the SkipTo is not a match
Example::
@@ -4197,11 +4199,11 @@ class SkipTo(ParseElementEnhance): # - parse action will call token.strip() for each matched token, i.e., the description body
string_data = SkipTo(SEP, ignore=quotedString)
string_data.setParseAction(tokenMap(str.strip))
- ticket_expr = (integer("issue_num") + SEP
- + string_data("sev") + SEP
- + string_data("desc") + SEP
+ ticket_expr = (integer("issue_num") + SEP
+ + string_data("sev") + SEP
+ + string_data("desc") + SEP
+ integer("days_open"))
-
+
for tkt in ticket_expr.searchString(report):
print tkt.dump()
prints::
@@ -4241,14 +4243,14 @@ class SkipTo(ParseElementEnhance): expr_parse = self.expr._parse
self_failOn_canParseNext = self.failOn.canParseNext if self.failOn is not None else None
self_ignoreExpr_tryParse = self.ignoreExpr.tryParse if self.ignoreExpr is not None else None
-
+
tmploc = loc
while tmploc <= instrlen:
if self_failOn_canParseNext is not None:
# break if failOn expression matches
if self_failOn_canParseNext(instring, tmploc):
break
-
+
if self_ignoreExpr_tryParse is not None:
# advance past ignore expressions
while 1:
@@ -4256,7 +4258,7 @@ class SkipTo(ParseElementEnhance): tmploc = self_ignoreExpr_tryParse(instring, tmploc)
except ParseBaseException:
break
-
+
try:
expr_parse(instring, tmploc, doActions=False, callPreParse=False)
except (ParseException, IndexError):
@@ -4274,7 +4276,7 @@ class SkipTo(ParseElementEnhance): loc = tmploc
skiptext = instring[startloc:loc]
skipresult = ParseResults(skiptext)
-
+
if self.includeMatch:
loc, mat = expr_parse(instring,loc,doActions,callPreParse=False)
skipresult += mat
@@ -4315,10 +4317,10 @@ class Forward(ParseElementEnhance): self.saveAsList = self.expr.saveAsList
self.ignoreExprs.extend(self.expr.ignoreExprs)
return self
-
+
def __ilshift__(self, other):
return self << other
-
+
def leaveWhitespace( self ):
self.skipWhitespace = False
return self
@@ -4452,16 +4454,16 @@ class Dict(TokenConverter): text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
attr_expr = (label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join))
-
+
# print attributes as plain groups
print(OneOrMore(attr_expr).parseString(text).dump())
-
+
# instead of OneOrMore(expr), parse using Dict(OneOrMore(Group(expr))) - Dict will auto-assign names
result = Dict(OneOrMore(Group(attr_expr))).parseString(text)
print(result.dump())
-
+
# access named fields as dict entries, or output as dict
- print(result['shape'])
+ print(result['shape'])
print(result.asDict())
prints::
['shape', 'SQUARE', 'posn', 'upper left', 'color', 'light blue', 'texture', 'burlap']
@@ -4548,8 +4550,8 @@ class OnlyOnce(object): def traceParseAction(f):
"""
- Decorator for debugging parse actions.
-
+ Decorator for debugging parse actions.
+
When the parse action is called, this decorator will print C{">> entering I{method-name}(line:I{current_source_line}, I{parse_location}, I{matched_tokens})".}
When the parse action completes, the decorator will print C{"<<"} followed by the returned value, or any exception that the parse action raised.
@@ -4616,7 +4618,7 @@ def countedArray( expr, intExpr=None ): integer expr expr expr...
where the leading integer tells how many expr expressions follow.
The matched tokens returns the array of expr tokens as a list - the leading count token is suppressed.
-
+
If C{intExpr} is specified, it should be a pyparsing expression that produces an integer value.
Example::
@@ -4799,7 +4801,7 @@ def dictOf( key, value ): text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
attr_expr = (label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join))
print(OneOrMore(attr_expr).parseString(text).dump())
-
+
attr_label = label
attr_value = Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join)
@@ -4826,11 +4828,11 @@ def originalTextFor(expr, asString=True): Helper to return the original, untokenized text for a given expression. Useful to
restore the parsed fields of an HTML start tag into the raw tag text itself, or to
revert separate tokens with intervening whitespace back to the original matching
- input text. By default, returns astring containing the original parsed text.
-
- If the optional C{asString} argument is passed as C{False}, then the return value is a
- C{L{ParseResults}} containing any results names that were originally matched, and a
- single token containing the original matched text from the input string. So if
+ input text. By default, returns a string containing the original parsed text.
+
+ If the optional C{asString} argument is passed as C{False}, then the return value is a
+ C{L{ParseResults}} containing any results names that were originally matched, and a
+ single token containing the original matched text from the input string. So if
the expression passed to C{L{originalTextFor}} contains expressions with defined
results names, you must set C{asString} to C{False} if you want to preserve those
results name values.
@@ -4858,7 +4860,7 @@ def originalTextFor(expr, asString=True): matchExpr.ignoreExprs = expr.ignoreExprs
return matchExpr
-def ungroup(expr):
+def ungroup(expr):
"""
Helper to undo pyparsing's default grouping of And expressions, even
if all but one are non-empty.
@@ -4915,8 +4917,8 @@ def srange(s): The values enclosed in the []'s may be:
- a single character
- an escaped character with a leading backslash (such as C{\-} or C{\]})
- - an escaped hex character with a leading C{'\x'} (C{\x21}, which is a C{'!'} character)
- (C{\0x##} is also supported for backwards compatibility)
+ - an escaped hex character with a leading C{'\x'} (C{\x21}, which is a C{'!'} character)
+ (C{\0x##} is also supported for backwards compatibility)
- an escaped octal character with a leading C{'\0'} (C{\041}, which is a C{'!'} character)
- a range of any of the above, separated by a dash (C{'a-z'}, etc.)
- any combination of the above (C{'aeiouy'}, C{'a-zA-Z0-9_$'}, etc.)
@@ -4946,7 +4948,7 @@ def replaceWith(replStr): num = Word(nums).setParseAction(lambda toks: int(toks[0]))
na = oneOf("N/A NA").setParseAction(replaceWith(math.nan))
term = na | num
-
+
OneOrMore(term).parseString("324 234 N/A 234") # -> [324, 234, nan, 234]
"""
return lambda s,l,t: [replStr]
@@ -4967,7 +4969,7 @@ def removeQuotes(s,l,t): def tokenMap(func, *args):
"""
- Helper to define a parse action by mapping a function to all elements of a ParseResults list.If any additional
+ Helper to define a parse action by mapping a function to all elements of a ParseResults list. If any additional
args are passed, they are forwarded to the given function as additional arguments after
the token, as in C{hex_integer = Word(hexnums).setParseAction(tokenMap(int, 16))}, which will convert the
parsed data to an integer using base 16.
@@ -4977,7 +4979,7 @@ def tokenMap(func, *args): hex_ints.runTests('''
00 11 22 aa FF 0a 0d 1a
''')
-
+
upperword = Word(alphas).setParseAction(tokenMap(str.upper))
OneOrMore(upperword).runTests('''
my kingdom for a horse
@@ -5001,7 +5003,7 @@ def tokenMap(func, *args): return [func(tokn, *args) for tokn in t]
try:
- func_name = getattr(func, '__name__',
+ func_name = getattr(func, '__name__',
getattr(func, '__class__').__name__)
except Exception:
func_name = str(func)
@@ -5014,7 +5016,7 @@ upcaseTokens = tokenMap(lambda t: _ustr(t).upper()) downcaseTokens = tokenMap(lambda t: _ustr(t).lower())
"""(Deprecated) Helper parse action to convert tokens to lower case. Deprecated in favor of L{pyparsing_common.downcaseTokens}"""
-
+
def _makeTags(tagStr, xml):
"""Internal helper to construct opening and closing tag expressions, given a tag name"""
if isinstance(tagStr,basestring):
@@ -5054,7 +5056,7 @@ def makeHTMLTags(tagStr): # makeHTMLTags returns pyparsing expressions for the opening and closing tags as a 2-tuple
a,a_end = makeHTMLTags("A")
link_expr = a + SkipTo(a_end)("link_text") + a_end
-
+
for link in link_expr.searchString(text):
# attributes in the <A> tag (like "href" shown here) are also accessible as named results
print(link.link_text, '->', link.href)
@@ -5087,7 +5089,7 @@ def withAttribute(*args,**attrDict): - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") )
For attribute names with a namespace prefix, you must use the second form. Attribute
names are matched insensitive to upper/lower case.
-
+
If just testing for C{class} (with or without a namespace), use C{L{withClass}}.
To verify that the attribute exists, but without specifying a value, pass
@@ -5101,7 +5103,7 @@ def withAttribute(*args,**attrDict): <div type="graph">1,3 2,3 1,1</div>
<div>this has no type</div>
</div>
-
+
'''
div,div_end = makeHTMLTags("div")
@@ -5110,7 +5112,7 @@ def withAttribute(*args,**attrDict): grid_expr = div_grid + SkipTo(div | div_end)("body")
for grid_header in grid_expr.searchString(html):
print(grid_header.body)
-
+
# construct a match with any div tag having a type attribute, regardless of the value
div_any_type = div().setParseAction(withAttribute(type=withAttribute.ANY_VALUE))
div_expr = div_any_type + SkipTo(div | div_end)("body")
@@ -5150,15 +5152,15 @@ def withClass(classname, namespace=''): <div class="graph">1,3 2,3 1,1</div>
<div>this <div> has no class</div>
</div>
-
+
'''
div,div_end = makeHTMLTags("div")
div_grid = div().setParseAction(withClass("grid"))
-
+
grid_expr = div_grid + SkipTo(div | div_end)("body")
for grid_header in grid_expr.searchString(html):
print(grid_header.body)
-
+
div_any_type = div().setParseAction(withClass(withAttribute.ANY_VALUE))
div_expr = div_any_type + SkipTo(div | div_end)("body")
for div_header in div_expr.searchString(html):
@@ -5170,7 +5172,7 @@ def withClass(classname, namespace=''): 1,3 2,3 1,1
"""
classattr = "%s:class" % namespace if namespace else "class"
- return withAttribute(**{classattr : classname})
+ return withAttribute(**{classattr : classname})
opAssoc = _Constants()
opAssoc.LEFT = object()
@@ -5181,9 +5183,9 @@ def infixNotation( baseExpr, opList, lpar=Suppress('('), rpar=Suppress(')') ): Helper method for constructing grammars of expressions made up of
operators working in a precedence hierarchy. Operators may be unary or
binary, left- or right-associative. Parse actions can also be attached
- to operator expressions. The generated parser will also recognize the use
+ to operator expressions. The generated parser will also recognize the use
of parentheses to override operator precedences (see example below).
-
+
Note: if you define a deep operator list, you may see performance issues
when using infixNotation. See L{ParserElement.enablePackrat} for a
mechanism to potentially improve your parser performance.
@@ -5213,15 +5215,15 @@ def infixNotation( baseExpr, opList, lpar=Suppress('('), rpar=Suppress(')') ): Example::
# simple example of four-function arithmetic with ints and variable names
integer = pyparsing_common.signed_integer
- varname = pyparsing_common.identifier
-
+ varname = pyparsing_common.identifier
+
arith_expr = infixNotation(integer | varname,
[
('-', 1, opAssoc.RIGHT),
(oneOf('* /'), 2, opAssoc.LEFT),
(oneOf('+ -'), 2, opAssoc.LEFT),
])
-
+
arith_expr.runTests('''
5+3*6
(5+3)*6
@@ -5335,23 +5337,23 @@ def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.cop code_body = nestedExpr('{', '}', ignoreExpr=(quotedString | cStyleComment))
- c_function = (decl_data_type("type")
+ c_function = (decl_data_type("type")
+ ident("name")
- + LPAR + Optional(delimitedList(arg), [])("args") + RPAR
+ + LPAR + Optional(delimitedList(arg), [])("args") + RPAR
+ code_body("body"))
c_function.ignore(cStyleComment)
-
+
source_code = '''
- int is_odd(int x) {
- return (x%2);
+ int is_odd(int x) {
+ return (x%2);
}
-
- int dec_to_hex(char hchar) {
- if (hchar >= '0' && hchar <= '9') {
- return (ord(hchar)-ord('0'));
- } else {
+
+ int dec_to_hex(char hchar) {
+ if (hchar >= '0' && hchar <= '9') {
+ return (ord(hchar)-ord('0'));
+ } else {
return (10+ord(hchar)-ord('A'));
- }
+ }
}
'''
for func in c_function.searchString(source_code):
@@ -5375,7 +5377,7 @@ def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.cop ).setParseAction(lambda t:t[0].strip()))
else:
if ignoreExpr is not None:
- content = (Combine(OneOrMore(~ignoreExpr +
+ content = (Combine(OneOrMore(~ignoreExpr +
~Literal(opener) + ~Literal(closer) +
CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1))
).setParseAction(lambda t:t[0].strip()))
@@ -5469,7 +5471,7 @@ def indentedBlock(blockStatementExpr, indentStack, indent=True): 'spam',
['(', 'x', 'y', ')'],
':',
- [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
+ [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
"""
def checkPeerIndent(s,l,t):
if l >= len(s): return
@@ -5720,10 +5722,10 @@ class pyparsing_common: fnumber = Regex(r'[+-]?\d+\.?\d*([eE][+-]?\d+)?').setName("fnumber").setParseAction(convertToFloat)
"""any int or real number, returned as float"""
-
+
identifier = Word(alphas+'_', alphanums+'_').setName("identifier")
"""typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""
-
+
ipv4_address = Regex(r'(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}').setName("IPv4 address")
"IPv4 address (C{0.0.0.0 - 255.255.255.255})"
@@ -5734,7 +5736,7 @@ class pyparsing_common: _mixed_ipv6_address = ("::ffff:" + ipv4_address).setName("mixed IPv6 address")
ipv6_address = Combine((_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).setName("IPv6 address")).setName("IPv6 address")
"IPv6 address (long, short, or mixed form)"
-
+
mac_address = Regex(r'[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}').setName("MAC address")
"MAC address xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"
@@ -5798,16 +5800,16 @@ class pyparsing_common: Parse action to remove HTML tags from web page HTML source
Example::
- # strip HTML links from normal text
+ # strip HTML links from normal text
text = '<td>More info at the <a href="http://pyparsing.wikispaces.com">pyparsing</a> wiki page</td>'
td,td_end = makeHTMLTags("TD")
table_text = td + SkipTo(td_end).setParseAction(pyparsing_common.stripHTMLTags)("body") + td_end
-
+
print(table_text.parseString(text).body) # -> 'More info at the pyparsing wiki page'
"""
return pyparsing_common._html_stripper.transformString(tokens[0])
- _commasepitem = Combine(OneOrMore(~Literal(",") + ~LineEnd() + Word(printables, excludeChars=',')
+ _commasepitem = Combine(OneOrMore(~Literal(",") + ~LineEnd() + Word(printables, excludeChars=',')
+ Optional( White(" \t") ) ) ).streamline().setName("commaItem")
comma_separated_list = delimitedList( Optional( quotedString.copy() | _commasepitem, default="") ).setName("comma separated list")
"""Predefined expression of 1 or more printable words or quoted strings, separated by commas."""
@@ -5819,6 +5821,104 @@ class pyparsing_common: """Parse action to convert tokens to lower case."""
class _lazyclassproperty(object):
    """
    Descriptor for a class-level attribute that is computed on first access.

    The wrapped function is called with the class through which the attribute
    is looked up, and its result is remembered for that class so later
    lookups do not recompute it.

    Results are kept in a table on the descriptor, keyed by class, instead of
    being assigned back onto the class.  Assigning the value onto a class
    replaces this descriptor there, so any subclass that had not yet computed
    its own value would silently inherit the parent's result.
    """

    def __init__(self, fn):
        # fn - callable taking the owning class and returning the value
        self.fn = fn
        # owning class -> value computed for that class
        self._values = {}

    def __get__(self, obj, cls):
        # the descriptor protocol permits the owner class to be omitted
        # when the lookup goes through an instance
        if cls is None:
            cls = type(obj)
        try:
            return self._values[cls]
        except KeyError:
            value = self._values[cls] = self.fn(cls)
            return value
+
+
class _unicode_set:
    """
    Base class for named sets of Unicode characters.

    A subclass sets C{_ranges} to a list of tuples of code points; each tuple
    gives the inclusive first and last code point of a range, and a 1-tuple
    stands for a single code point.  The character strings defined here are
    built from those ranges the first time they are requested.
    """
    _ranges = []

    @classmethod
    def _chars_in_ranges(cls):
        """
        Return a list of the characters covered by C{cls._ranges}, in the
        order the ranges are listed.

        A code point covered by more than one range is listed only once, so
        a set assembled from overlapping ranges does not repeat characters.
        """
        seen = set()
        chars = []
        for bounds in cls._ranges:
            # bounds[-1] is bounds[0] for a 1-tuple, giving a single code point
            for code in range(bounds[0], bounds[-1] + 1):
                if code not in seen:
                    seen.add(code)
                    chars.append(chr(code))
        return chars

    @_lazyclassproperty
    def printables(cls):
        """all characters in this set that are not whitespace"""
        return ''.join(c for c in cls._chars_in_ranges() if not c.isspace())

    @_lazyclassproperty
    def alphas(cls):
        """all characters in this set for which C{str.isalpha} is true"""
        return ''.join(c for c in cls._chars_in_ranges() if c.isalpha())

    @_lazyclassproperty
    def nums(cls):
        """all characters in this set for which C{str.isdigit} is true"""
        return ''.join(c for c in cls._chars_in_ranges() if c.isdigit())

    @_lazyclassproperty
    def alphanums(cls):
        """the C{alphas} string followed by the C{nums} string"""
        return cls.alphas + cls.nums
+
+
class pyparsing_unicode(_unicode_set):
    """
    Namespace of named Unicode character sets.

    Each nested class lists the code point ranges of one script or language.
    The outer class itself spans every code point from 32 up to
    C{sys.maxunicode}.
    """
    _ranges = [(32, sys.maxunicode)]

    class Latin1(_unicode_set):
        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
        ]

    class Greek(_unicode_set):
        # 1-tuples are single code points
        _ranges = [
            (0x0370, 0x03ff),
            (0x1f00, 0x1f15),
            (0x1f18, 0x1f1d),
            (0x1f20, 0x1f45),
            (0x1f48, 0x1f4d),
            (0x1f50, 0x1f57),
            (0x1f59,),
            (0x1f5b,),
            (0x1f5d,),
            (0x1f5f, 0x1f7d),
            (0x1f80, 0x1fb4),
            (0x1fb6, 0x1fc4),
            (0x1fc6, 0x1fd3),
            (0x1fd6, 0x1fdb),
            (0x1fdd, 0x1fef),
            (0x1ff2, 0x1ff4),
            (0x1ff6, 0x1ffe),
        ]

    class Cyrillic(_unicode_set):
        _ranges = [
            (0x0400, 0x04ff),
        ]

    class Chinese(_unicode_set):
        _ranges = [
            (0x4e00, 0x9fff),
        ]

    class Japanese(_unicode_set):
        # placeholder: assigned after the class definitions as the sum of
        # the Kanji, Hiragana, and Katakana ranges
        _ranges = []

        class Kanji(_unicode_set):
            _ranges = [
                (0x4e00, 0x9fbf),
            ]

        class Hiragana(_unicode_set):
            _ranges = [
                (0x3040, 0x309f),
            ]

        class Katakana(_unicode_set):
            _ranges = [
                (0x30a0, 0x30ff),
            ]

    class Korean(_unicode_set):
        _ranges = [
            (0xac00, 0xd7af),
            (0x1100, 0x11ff),
            (0x3130, 0x318f),
            (0xa960, 0xa97f),
            (0xd7b0, 0xd7ff),
        ]

    class CJK(_unicode_set):
        # placeholder: assigned after the class definitions as the sum of
        # the Chinese, Japanese, and Korean ranges
        _ranges = []

    class Thai(_unicode_set):
        _ranges = [
            (0x0e01, 0x0e3a),
            (0x0e3f, 0x0e5b),
        ]

    class Arabic(_unicode_set):
        _ranges = [
            (0x0600, 0x061b),
            (0x061e, 0x06ff),
            (0x0700, 0x077f),
        ]

    class Hebrew(_unicode_set):
        _ranges = [
            (0x0590, 0x05ff),
        ]
+
# Fill in the composite sets now that their component classes exist.
# Japanese has to be assigned first because the CJK sum reads it.
pyparsing_unicode.Japanese._ranges = (
    pyparsing_unicode.Japanese.Kanji._ranges
    + pyparsing_unicode.Japanese.Hiragana._ranges
    + pyparsing_unicode.Japanese.Katakana._ranges
)
pyparsing_unicode.CJK._ranges = (
    pyparsing_unicode.Chinese._ranges
    + pyparsing_unicode.Japanese._ranges
    + pyparsing_unicode.Korean._ranges
)

# make each character set reachable under its native-language name as well
pyparsing_unicode.العربية = pyparsing_unicode.Arabic
pyparsing_unicode.中文 = pyparsing_unicode.Chinese
pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
pyparsing_unicode.Ελληνικά = pyparsing_unicode.Greek
pyparsing_unicode.עִברִית = pyparsing_unicode.Hebrew
pyparsing_unicode.日本語 = pyparsing_unicode.Japanese
pyparsing_unicode.한국어 = pyparsing_unicode.Korean
pyparsing_unicode.ไทย = pyparsing_unicode.Thai

# native-language names for the scripts nested inside Japanese
pyparsing_unicode.Japanese.漢字 = pyparsing_unicode.Japanese.Kanji
pyparsing_unicode.Japanese.ひらがな = pyparsing_unicode.Japanese.Hiragana
pyparsing_unicode.Japanese.カタカナ = pyparsing_unicode.Japanese.Katakana
+
+
if __name__ == "__main__":
selectToken = CaselessLiteral("select")
@@ -5832,7 +5932,7 @@ if __name__ == "__main__": tableName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens)
tableNameList = Group(delimitedList(tableName)).setName("tables")
-
+
simpleSQL = selectToken("command") + columnSpec("columns") + fromToken + tableNameList("tables")
# demo runTests method, including embedded comments in test string
|