diff options
author | Georg Brandl <georg@python.org> | 2008-12-29 20:22:18 +0100 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2008-12-29 20:22:18 +0100 |
commit | 49c15e93a21014f6640bfd0e0c6752bdc61d105c (patch) | |
tree | 8cb3e80a39028c2a78fc6155d100d36cacbec333 /sphinx/pycode | |
parent | 2dc410c1bb8e078c5e98cb5bdad332c28abb998f (diff) | |
download | sphinx-git-49c15e93a21014f6640bfd0e0c6752bdc61d105c.tar.gz |
Add pgen2 and custom utilities.
Diffstat (limited to 'sphinx/pycode')
-rw-r--r-- | sphinx/pycode/Grammar.txt | 155 | ||||
-rw-r--r-- | sphinx/pycode/__init__.py | 142 | ||||
-rw-r--r-- | sphinx/pycode/pgen2/__init__.py | 4 | ||||
-rw-r--r-- | sphinx/pycode/pgen2/driver.py | 145 | ||||
-rw-r--r-- | sphinx/pycode/pgen2/grammar.py | 171 | ||||
-rw-r--r-- | sphinx/pycode/pgen2/literals.py | 60 | ||||
-rw-r--r-- | sphinx/pycode/pgen2/parse.py | 201 | ||||
-rw-r--r-- | sphinx/pycode/pgen2/pgen.py | 384 | ||||
-rwxr-xr-x | sphinx/pycode/pgen2/token.py | 82 | ||||
-rw-r--r-- | sphinx/pycode/pgen2/tokenize.py | 405 | ||||
-rw-r--r-- | sphinx/pycode/pytree.py | 295 |
11 files changed, 2044 insertions, 0 deletions
diff --git a/sphinx/pycode/Grammar.txt b/sphinx/pycode/Grammar.txt new file mode 100644 index 000000000..1f4a50ffb --- /dev/null +++ b/sphinx/pycode/Grammar.txt @@ -0,0 +1,155 @@ +# Grammar for Python + +# Note: Changing the grammar specified in this file will most likely +# require corresponding changes in the parser module +# (../Modules/parsermodule.c). If you can't make the changes to +# that module yourself, please co-ordinate the required changes +# with someone who can; ask around on python-dev for help. Fred +# Drake <fdrake@acm.org> will probably be listening there. + +# NOTE WELL: You should also follow all the steps listed in PEP 306, +# "How to Change Python's Grammar" + +# Commands for Kees Blom's railroad program +#diagram:token NAME +#diagram:token NUMBER +#diagram:token STRING +#diagram:token NEWLINE +#diagram:token ENDMARKER +#diagram:token INDENT +#diagram:output\input python.bla +#diagram:token DEDENT +#diagram:output\textwidth 20.04cm\oddsidemargin 0.0cm\evensidemargin 0.0cm +#diagram:rules + +# Start symbols for the grammar: +# file_input is a module or sequence of commands read from an input file; +# single_input is a single interactive statement; +# eval_input is the input for the eval() and input() functions. +# NB: compound_stmt in single_input is followed by extra NEWLINE! +file_input: (NEWLINE | stmt)* ENDMARKER +single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE +eval_input: testlist NEWLINE* ENDMARKER + +decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef) +funcdef: 'def' NAME parameters ['->' test] ':' suite +parameters: '(' [typedargslist] ')' +typedargslist: ((tfpdef ['=' test] ',')* + ('*' [tname] (',' tname ['=' test])* [',' '**' tname] | '**' tname) + | tfpdef ['=' test] (',' tfpdef ['=' test])* [',']) +tname: NAME [':' test] +tfpdef: tname | '(' tfplist ')' +tfplist: tfpdef (',' tfpdef)* [','] +varargslist: ((vfpdef ['=' test] ',')* + ('*' [vname] (',' vname ['=' test])* [',' '**' vname] | '**' vname) + | vfpdef ['=' test] (',' vfpdef ['=' test])* [',']) +vname: NAME +vfpdef: vname | '(' vfplist ')' +vfplist: vfpdef (',' vfpdef)* [','] + +stmt: simple_stmt | compound_stmt +simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE +small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | + import_stmt | global_stmt | exec_stmt | assert_stmt) +expr_stmt: testlist (augassign (yield_expr|testlist) | + ('=' (yield_expr|testlist))*) +augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | + '<<=' | '>>=' | '**=' | '//=') +# For normal assignments, additional restrictions enforced by the interpreter +print_stmt: 'print' ( [ test (',' test)* [','] ] | + '>>' test [ (',' test)+ [','] ] ) +del_stmt: 'del' exprlist +pass_stmt: 'pass' +flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: 'break' +continue_stmt: 'continue' +return_stmt: 'return' [testlist] +yield_stmt: yield_expr +raise_stmt: 'raise' [test ['from' test | ',' test [',' test]]] +import_stmt: import_name | import_from +import_name: 'import' dotted_as_names +import_from: ('from' ('.'* dotted_name | '.'+) + 'import' ('*' | '(' import_as_names ')' | import_as_names)) +import_as_name: NAME ['as' NAME] +dotted_as_name: dotted_name ['as' NAME] +import_as_names: import_as_name (',' import_as_name)* [','] +dotted_as_names: dotted_as_name (',' dotted_as_name)* +dotted_name: NAME ('.' NAME)* +global_stmt: ('global' | 'nonlocal') NAME (',' NAME)* +exec_stmt: 'exec' expr ['in' test [',' test]] +assert_stmt: 'assert' test [',' test] + +compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated +if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite] +while_stmt: 'while' test ':' suite ['else' ':' suite] +for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite] +try_stmt: ('try' ':' suite + ((except_clause ':' suite)+ + ['else' ':' suite] + ['finally' ':' suite] | + 'finally' ':' suite)) +with_stmt: 'with' test [ with_var ] ':' suite +with_var: 'as' expr +# NB compile.c makes sure that the default except clause is last +except_clause: 'except' [test [(',' | 'as') test]] +suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT + +# Backward compatibility cruft to support: +# [ x for x in lambda: True, lambda: False if x() ] +# even while also allowing: +# lambda x: 5 if x else 2 +# (But not a mix of the two) +testlist_safe: old_test [(',' old_test)+ [',']] +old_test: or_test | old_lambdef +old_lambdef: 'lambda' [varargslist] ':' old_test + +test: or_test ['if' or_test 'else' test] | lambdef +or_test: and_test ('or' and_test)* +and_test: not_test ('and' not_test)* +not_test: 'not' not_test | comparison +comparison: expr (comp_op expr)* +comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' +expr: xor_expr ('|' xor_expr)* +xor_expr: and_expr ('^' and_expr)* +and_expr: shift_expr ('&' shift_expr)* +shift_expr: arith_expr (('<<'|'>>') arith_expr)* +arith_expr: term (('+'|'-') term)* +term: factor (('*'|'/'|'%'|'//') factor)* +factor: ('+'|'-'|'~') factor | power +power: atom trailer* ['**' factor] +atom: ('(' [yield_expr|testlist_gexp] ')' | + '[' [listmaker] ']' | + '{' [dictsetmaker] '}' | + '`' testlist1 '`' | + NAME | NUMBER | STRING+ | '.' '.' '.') +listmaker: test ( comp_for | (',' test)* [','] ) +testlist_gexp: test ( comp_for | (',' test)* [','] ) +lambdef: 'lambda' [varargslist] ':' test +trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME +subscriptlist: subscript (',' subscript)* [','] +subscript: test | [test] ':' [test] [sliceop] +sliceop: ':' [test] +exprlist: expr (',' expr)* [','] +testlist: test (',' test)* [','] +dictsetmaker: ( (test ':' test (comp_for | (',' test ':' test)* [','])) | + (test (comp_for | (',' test)* [','])) ) + +classdef: 'class' NAME ['(' [arglist] ')'] ':' suite + +arglist: (argument ',')* (argument [','] + |'*' test (',' argument)* [',' '**' test] + |'**' test) +argument: test [comp_for] | test '=' test # Really [keyword '='] test + +comp_iter: comp_for | comp_if +comp_for: 'for' exprlist 'in' testlist_safe [comp_iter] +comp_if: 'if' old_test [comp_iter] + +testlist1: test (',' test)* + +# not used in grammar, but may appear in "node" passed from Parser to Compiler +encoding_decl: NAME + +yield_expr: 'yield' [testlist] diff --git a/sphinx/pycode/__init__.py b/sphinx/pycode/__init__.py new file mode 100644 index 000000000..b5d83e679 --- /dev/null +++ b/sphinx/pycode/__init__.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- +""" + sphinx.pycode + ~~~~~~~~~~~~~ + + Utilities parsing and analyzing Python code. + + :copyright: 2008 by Georg Brandl. + :license: BSD, see LICENSE for details. +""" + +import sys +from os import path + +from sphinx.pycode import pytree +from sphinx.pycode.pgen2 import driver, token + + +# load the Python grammar +_grammarfile = path.join(path.dirname(__file__), 'Grammar.txt') +pygrammar = driver.load_grammar(_grammarfile) +pydriver = driver.Driver(pygrammar, convert=pytree.convert) + +class sym: pass +for k, v in pygrammar.symbol2number.iteritems(): + setattr(sym, k, v) + +# a dict mapping terminal and nonterminal numbers to their names +number2name = pygrammar.number2symbol.copy() +number2name.update(token.tok_name) + + +def prepare_commentdoc(s): + result = [] + lines = [line.strip() for line in s.expandtabs().splitlines()] + for line in lines: + if line.startswith('#: '): + result.append(line[3:]) + if result and result[-1]: + result.append('') + return '\n'.join(result) + + +_eq = pytree.Leaf(token.EQUAL, '=') + + +class ClassAttrVisitor(pytree.NodeVisitor): + def init(self): + self.namespace = [] + + def visit_classdef(self, node): + self.namespace.append(node[1].value) + self.generic_visit(node) + self.namespace.pop() + + def visit_expr_stmt(self, node): + if _eq in node.children: + prefix = node[0].get_prefix() + if not prefix: + prev = node[0].get_prev_leaf() + if prev and prev.type == token.INDENT: + prefix = prev.prefix + doc = prepare_commentdoc(prefix) + if doc: + targ = '.'.join(self.namespace + [node[0].compact()]) + print targ + print doc + + def visit_funcdef(self, node): + return + + +class ModuleAnalyzer(object): + + def __init__(self, tree, modname, srcname): + self.tree = tree + self.modname = modname + self.srcname = srcname + + @classmethod + def for_string(cls, string, modname, srcname='<string>'): + return cls(pydriver.parse_string(string), modname, srcname) + + @classmethod + def for_file(cls, filename, modname): + # XXX if raises + fileobj = open(filename, 'r') + try: + return cls(pydriver.parse_stream(fileobj), modname, filename) + finally: + fileobj.close() + + @classmethod + def for_module(cls, modname): + if modname not in sys.modules: + # XXX + __import__(modname) + mod = sys.modules[modname] + if hasattr(mod, '__loader__'): + # XXX raises + source = mod.__loader__.get_source(modname) + return cls.for_string(source, modname) + filename = getattr(mod, '__file__', None) + if filename is None: + # XXX + raise RuntimeError('no source found') + if filename.lower().endswith('.pyo') or \ + filename.lower().endswith('.pyc'): + filename = filename[:-1] + elif not filename.lower().endswith('.py'): + raise RuntimeError('not a .py file') + if not path.isfile(filename): + # XXX + raise RuntimeError('source not present') + return cls.for_file(filename, modname) + + def find_defs(self): + attr_visitor = ClassAttrVisitor(number2name) + attr_visitor.namespace = [self.modname] + attr_visitor.visit(self.tree) + +class Test: + """doc""" + + #: testing... + x = 1 + """doc""" + + #: testing more... + x = 2 + + +#ma = ModuleAnalyzer.for_file(__file__.rstrip('c')) +import time +x0=time.time() +ma = ModuleAnalyzer.for_module('sphinx.builders.latex') +x1=time.time() +ma.find_defs() +x2=time.time() +print "%.4f %.4f" % (x1-x0, x2-x1) + +#print pytree.nice_repr(ma.tree, number2name, True) diff --git a/sphinx/pycode/pgen2/__init__.py b/sphinx/pycode/pgen2/__init__.py new file mode 100644 index 000000000..af3904845 --- /dev/null +++ b/sphinx/pycode/pgen2/__init__.py @@ -0,0 +1,4 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""The pgen2 package.""" diff --git a/sphinx/pycode/pgen2/driver.py b/sphinx/pycode/pgen2/driver.py new file mode 100644 index 000000000..3e9e10435 --- /dev/null +++ b/sphinx/pycode/pgen2/driver.py @@ -0,0 +1,145 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright 2006 Google, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""Parser driver. + +This provides a high-level interface to parse a file into a syntax tree. + +""" + +__author__ = "Guido van Rossum <guido@python.org>" + +__all__ = ["Driver", "load_grammar"] + +# Python imports +import os +import logging +import sys + +# Pgen imports +from sphinx.pycode.pgen2 import grammar, parse, token, tokenize, pgen + + +class Driver(object): + + def __init__(self, grammar, convert=None, logger=None): + self.grammar = grammar + if logger is None: + logger = logging.getLogger() + self.logger = logger + self.convert = convert + + def parse_tokens(self, tokens, debug=False): + """Parse a series of tokens and return the syntax tree.""" + # XXX Move the prefix computation into a wrapper around tokenize. + p = parse.Parser(self.grammar, self.convert) + p.setup() + lineno = 1 + column = 0 + type = value = start = end = line_text = None + prefix = "" + for quintuple in tokens: + type, value, start, end, line_text = quintuple + if start != (lineno, column): + assert (lineno, column) <= start, ((lineno, column), start) + s_lineno, s_column = start + if lineno < s_lineno: + prefix += "\n" * (s_lineno - lineno) + lineno = s_lineno + column = 0 + if column < s_column: + prefix += line_text[column:s_column] + column = s_column + if type in (tokenize.COMMENT, tokenize.NL): + prefix += value + lineno, column = end + if value.endswith("\n"): + lineno += 1 + column = 0 + continue + if type == token.OP: + type = grammar.opmap[value] + if debug: + self.logger.debug("%s %r (prefix=%r)", + token.tok_name[type], value, prefix) + if p.addtoken(type, value, (prefix, start)): + if debug: + self.logger.debug("Stop.") + break + prefix = "" + lineno, column = end + if value.endswith("\n"): + lineno += 1 + column = 0 + else: + # We never broke out -- EOF is too soon (how can this happen???) + raise parse.ParseError("incomplete input", t, v, x) + return p.rootnode + + def parse_stream_raw(self, stream, debug=False): + """Parse a stream and return the syntax tree.""" + tokens = tokenize.generate_tokens(stream.readline) + return self.parse_tokens(tokens, debug) + + def parse_stream(self, stream, debug=False): + """Parse a stream and return the syntax tree.""" + return self.parse_stream_raw(stream, debug) + + def parse_file(self, filename, debug=False): + """Parse a file and return the syntax tree.""" + stream = open(filename) + try: + return self.parse_stream(stream, debug) + finally: + stream.close() + + def parse_string(self, text, debug=False): + """Parse a string and return the syntax tree.""" + tokens = tokenize.generate_tokens(generate_lines(text).next) + return self.parse_tokens(tokens, debug) + + +def generate_lines(text): + """Generator that behaves like readline without using StringIO.""" + for line in text.splitlines(True): + yield line + while True: + yield "" + + +def load_grammar(gt="Grammar.txt", gp=None, + save=True, force=False, logger=None): + """Load the grammar (maybe from a pickle).""" + if logger is None: + logger = logging.getLogger() + if gp is None: + head, tail = os.path.splitext(gt) + if tail == ".txt": + tail = "" + gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle" + if force or not _newer(gp, gt): + logger.info("Generating grammar tables from %s", gt) + g = pgen.generate_grammar(gt) + if save: + logger.info("Writing grammar tables to %s", gp) + try: + g.dump(gp) + except IOError, e: + logger.info("Writing failed:"+str(e)) + else: + g = grammar.Grammar() + g.load(gp) + return g + + +def _newer(a, b): + """Inquire whether file a was written since file b.""" + if not os.path.exists(a): + return False + if not os.path.exists(b): + return True + return os.path.getmtime(a) >= os.path.getmtime(b) diff --git a/sphinx/pycode/pgen2/grammar.py b/sphinx/pycode/pgen2/grammar.py new file mode 100644 index 000000000..381d80e82 --- /dev/null +++ b/sphinx/pycode/pgen2/grammar.py @@ -0,0 +1,171 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""This module defines the data structures used to represent a grammar. + +These are a bit arcane because they are derived from the data +structures used by Python's 'pgen' parser generator. + +There's also a table here mapping operators to their names in the +token module; the Python tokenize module reports all operators as the +fallback token code OP, but the parser needs the actual token code. + +""" + +# Python imports +import pickle + +# Local imports +from sphinx.pycode.pgen2 import token, tokenize + + +class Grammar(object): + """Pgen parsing tables tables conversion class. + + Once initialized, this class supplies the grammar tables for the + parsing engine implemented by parse.py. The parsing engine + accesses the instance variables directly. The class here does not + provide initialization of the tables; several subclasses exist to + do this (see the conv and pgen modules). + + The load() method reads the tables from a pickle file, which is + much faster than the other ways offered by subclasses. The pickle + file is written by calling dump() (after loading the grammar + tables using a subclass). The report() method prints a readable + representation of the tables to stdout, for debugging. + + The instance variables are as follows: + + symbol2number -- a dict mapping symbol names to numbers. Symbol + numbers are always 256 or higher, to distinguish + them from token numbers, which are between 0 and + 255 (inclusive). + + number2symbol -- a dict mapping numbers to symbol names; + these two are each other's inverse. + + states -- a list of DFAs, where each DFA is a list of + states, each state is is a list of arcs, and each + arc is a (i, j) pair where i is a label and j is + a state number. The DFA number is the index into + this list. (This name is slightly confusing.) + Final states are represented by a special arc of + the form (0, j) where j is its own state number. + + dfas -- a dict mapping symbol numbers to (DFA, first) + pairs, where DFA is an item from the states list + above, and first is a set of tokens that can + begin this grammar rule (represented by a dict + whose values are always 1). + + labels -- a list of (x, y) pairs where x is either a token + number or a symbol number, and y is either None + or a string; the strings are keywords. The label + number is the index in this list; label numbers + are used to mark state transitions (arcs) in the + DFAs. + + start -- the number of the grammar's start symbol. + + keywords -- a dict mapping keyword strings to arc labels. + + tokens -- a dict mapping token numbers to arc labels. + + """ + + def __init__(self): + self.symbol2number = {} + self.number2symbol = {} + self.states = [] + self.dfas = {} + self.labels = [(0, "EMPTY")] + self.keywords = {} + self.tokens = {} + self.symbol2label = {} + self.start = 256 + + def dump(self, filename): + """Dump the grammar tables to a pickle file.""" + f = open(filename, "wb") + pickle.dump(self.__dict__, f, 2) + f.close() + + def load(self, filename): + """Load the grammar tables from a pickle file.""" + f = open(filename, "rb") + d = pickle.load(f) + f.close() + self.__dict__.update(d) + + def report(self): + """Dump the grammar tables to standard output, for debugging.""" + from pprint import pprint + print "s2n" + pprint(self.symbol2number) + print "n2s" + pprint(self.number2symbol) + print "states" + pprint(self.states) + print "dfas" + pprint(self.dfas) + print "labels" + pprint(self.labels) + print "start", self.start + + +# Map from operator to number (since tokenize doesn't do this) + +opmap_raw = """ +( LPAR +) RPAR +[ LSQB +] RSQB +: COLON +, COMMA +; SEMI ++ PLUS +- MINUS +* STAR +/ SLASH +| VBAR +& AMPER +< LESS +> GREATER += EQUAL +. DOT +% PERCENT +` BACKQUOTE +{ LBRACE +} RBRACE +@ AT +== EQEQUAL +!= NOTEQUAL +<> NOTEQUAL +<= LESSEQUAL +>= GREATEREQUAL +~ TILDE +^ CIRCUMFLEX +<< LEFTSHIFT +>> RIGHTSHIFT +** DOUBLESTAR ++= PLUSEQUAL +-= MINEQUAL +*= STAREQUAL +/= SLASHEQUAL +%= PERCENTEQUAL +&= AMPEREQUAL +|= VBAREQUAL +^= CIRCUMFLEXEQUAL +<<= LEFTSHIFTEQUAL +>>= RIGHTSHIFTEQUAL +**= DOUBLESTAREQUAL +// DOUBLESLASH +//= DOUBLESLASHEQUAL +-> RARROW +""" + +opmap = {} +for line in opmap_raw.splitlines(): + if line: + op, name = line.split() + opmap[op] = getattr(token, name) diff --git a/sphinx/pycode/pgen2/literals.py b/sphinx/pycode/pgen2/literals.py new file mode 100644 index 000000000..0b3948a54 --- /dev/null +++ b/sphinx/pycode/pgen2/literals.py @@ -0,0 +1,60 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""Safely evaluate Python string literals without using eval().""" + +import re + +simple_escapes = {"a": "\a", + "b": "\b", + "f": "\f", + "n": "\n", + "r": "\r", + "t": "\t", + "v": "\v", + "'": "'", + '"': '"', + "\\": "\\"} + +def escape(m): + all, tail = m.group(0, 1) + assert all.startswith("\\") + esc = simple_escapes.get(tail) + if esc is not None: + return esc + if tail.startswith("x"): + hexes = tail[1:] + if len(hexes) < 2: + raise ValueError("invalid hex string escape ('\\%s')" % tail) + try: + i = int(hexes, 16) + except ValueError: + raise ValueError("invalid hex string escape ('\\%s')" % tail) + else: + try: + i = int(tail, 8) + except ValueError: + raise ValueError("invalid octal string escape ('\\%s')" % tail) + return chr(i) + +def evalString(s): + assert s.startswith("'") or s.startswith('"'), repr(s[:1]) + q = s[0] + if s[:3] == q*3: + q = q*3 + assert s.endswith(q), repr(s[-len(q):]) + assert len(s) >= 2*len(q) + s = s[len(q):-len(q)] + return re.sub(r"\\(\'|\"|\\|[abfnrtv]|x.{0,2}|[0-7]{1,3})", escape, s) + +def test(): + for i in range(256): + c = chr(i) + s = repr(c) + e = evalString(s) + if e != c: + print i, c, s, e + + +if __name__ == "__main__": + test() diff --git a/sphinx/pycode/pgen2/parse.py b/sphinx/pycode/pgen2/parse.py new file mode 100644 index 000000000..60eec05ea --- /dev/null +++ b/sphinx/pycode/pgen2/parse.py @@ -0,0 +1,201 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""Parser engine for the grammar tables generated by pgen. + +The grammar table must be loaded first. + +See Parser/parser.c in the Python distribution for additional info on +how this parsing engine works. + +""" + +# Local imports +from sphinx.pycode.pgen2 import token + +class ParseError(Exception): + """Exception to signal the parser is stuck.""" + + def __init__(self, msg, type, value, context): + Exception.__init__(self, "%s: type=%r, value=%r, context=%r" % + (msg, type, value, context)) + self.msg = msg + self.type = type + self.value = value + self.context = context + +class Parser(object): + """Parser engine. + + The proper usage sequence is: + + p = Parser(grammar, [converter]) # create instance + p.setup([start]) # prepare for parsing + <for each input token>: + if p.addtoken(...): # parse a token; may raise ParseError + break + root = p.rootnode # root of abstract syntax tree + + A Parser instance may be reused by calling setup() repeatedly. + + A Parser instance contains state pertaining to the current token + sequence, and should not be used concurrently by different threads + to parse separate token sequences. + + See driver.py for how to get input tokens by tokenizing a file or + string. + + Parsing is complete when addtoken() returns True; the root of the + abstract syntax tree can then be retrieved from the rootnode + instance variable. When a syntax error occurs, addtoken() raises + the ParseError exception. There is no error recovery; the parser + cannot be used after a syntax error was reported (but it can be + reinitialized by calling setup()). + + """ + + def __init__(self, grammar, convert=None): + """Constructor. + + The grammar argument is a grammar.Grammar instance; see the + grammar module for more information. + + The parser is not ready yet for parsing; you must call the + setup() method to get it started. + + The optional convert argument is a function mapping concrete + syntax tree nodes to abstract syntax tree nodes. If not + given, no conversion is done and the syntax tree produced is + the concrete syntax tree. If given, it must be a function of + two arguments, the first being the grammar (a grammar.Grammar + instance), and the second being the concrete syntax tree node + to be converted. The syntax tree is converted from the bottom + up. + + A concrete syntax tree node is a (type, value, context, nodes) + tuple, where type is the node type (a token or symbol number), + value is None for symbols and a string for tokens, context is + None or an opaque value used for error reporting (typically a + (lineno, offset) pair), and nodes is a list of children for + symbols, and None for tokens. + + An abstract syntax tree node may be anything; this is entirely + up to the converter function. + + """ + self.grammar = grammar + self.convert = convert or (lambda grammar, node: node) + + def setup(self, start=None): + """Prepare for parsing. + + This *must* be called before starting to parse. + + The optional argument is an alternative start symbol; it + defaults to the grammar's start symbol. + + You can use a Parser instance to parse any number of programs; + each time you call setup() the parser is reset to an initial + state determined by the (implicit or explicit) start symbol. + + """ + if start is None: + start = self.grammar.start + # Each stack entry is a tuple: (dfa, state, node). + # A node is a tuple: (type, value, context, children), + # where children is a list of nodes or None, and context may be None. + newnode = (start, None, None, []) + stackentry = (self.grammar.dfas[start], 0, newnode) + self.stack = [stackentry] + self.rootnode = None + self.used_names = set() # Aliased to self.rootnode.used_names in pop() + + def addtoken(self, type, value, context): + """Add a token; return True iff this is the end of the program.""" + # Map from token to label + ilabel = self.classify(type, value, context) + # Loop until the token is shifted; may raise exceptions + while True: + dfa, state, node = self.stack[-1] + states, first = dfa + arcs = states[state] + # Look for a state with this label + for i, newstate in arcs: + t, v = self.grammar.labels[i] + if ilabel == i: + # Look it up in the list of labels + assert t < 256 + # Shift a token; we're done with it + self.shift(type, value, newstate, context) + # Pop while we are in an accept-only state + state = newstate + while states[state] == [(0, state)]: + self.pop() + if not self.stack: + # Done parsing! + return True + dfa, state, node = self.stack[-1] + states, first = dfa + # Done with this token + return False + elif t >= 256: + # See if it's a symbol and if we're in its first set + itsdfa = self.grammar.dfas[t] + itsstates, itsfirst = itsdfa + if ilabel in itsfirst: + # Push a symbol + self.push(t, self.grammar.dfas[t], newstate, context) + break # To continue the outer while loop + else: + if (0, state) in arcs: + # An accepting state, pop it and try something else + self.pop() + if not self.stack: + # Done parsing, but another token is input + raise ParseError("too much input", + type, value, context) + else: + # No success finding a transition + raise ParseError("bad input", type, value, context) + + def classify(self, type, value, context): + """Turn a token into a label. (Internal)""" + if type == token.NAME: + # Keep a listing of all used names + self.used_names.add(value) + # Check for reserved words + ilabel = self.grammar.keywords.get(value) + if ilabel is not None: + return ilabel + ilabel = self.grammar.tokens.get(type) + if ilabel is None: + raise ParseError("bad token", type, value, context) + return ilabel + + def shift(self, type, value, newstate, context): + """Shift a token. (Internal)""" + dfa, state, node = self.stack[-1] + newnode = (type, value, context, None) + newnode = self.convert(self.grammar, newnode) + if newnode is not None: + node[-1].append(newnode) + self.stack[-1] = (dfa, newstate, node) + + def push(self, type, newdfa, newstate, context): + """Push a nonterminal. (Internal)""" + dfa, state, node = self.stack[-1] + newnode = (type, None, context, []) + self.stack[-1] = (dfa, newstate, node) + self.stack.append((newdfa, 0, newnode)) + + def pop(self): + """Pop a nonterminal. (Internal)""" + popdfa, popstate, popnode = self.stack.pop() + newnode = self.convert(self.grammar, popnode) + if newnode is not None: + if self.stack: + dfa, state, node = self.stack[-1] + node[-1].append(newnode) + else: + self.rootnode = newnode + self.rootnode.used_names = self.used_names diff --git a/sphinx/pycode/pgen2/pgen.py b/sphinx/pycode/pgen2/pgen.py new file mode 100644 index 000000000..d6895eaef --- /dev/null +++ b/sphinx/pycode/pgen2/pgen.py @@ -0,0 +1,384 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Pgen imports +from sphinx.pycode.pgen2 import grammar, token, tokenize + +class PgenGrammar(grammar.Grammar): + pass + +class ParserGenerator(object): + + def __init__(self, filename, stream=None): + close_stream = None + if stream is None: + stream = open(filename) + close_stream = stream.close + self.filename = filename + self.stream = stream + self.generator = tokenize.generate_tokens(stream.readline) + self.gettoken() # Initialize lookahead + self.dfas, self.startsymbol = self.parse() + if close_stream is not None: + close_stream() + self.first = {} # map from symbol name to set of tokens + self.addfirstsets() + + def make_grammar(self): + c = PgenGrammar() + names = self.dfas.keys() + names.sort() + names.remove(self.startsymbol) + names.insert(0, self.startsymbol) + for name in names: + i = 256 + len(c.symbol2number) + c.symbol2number[name] = i + c.number2symbol[i] = name + for name in names: + dfa = self.dfas[name] + states = [] + for state in dfa: + arcs = [] + for label, next in state.arcs.iteritems(): + arcs.append((self.make_label(c, label), dfa.index(next))) + if state.isfinal: + arcs.append((0, dfa.index(state))) + states.append(arcs) + c.states.append(states) + c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) + c.start = c.symbol2number[self.startsymbol] + return c + + def make_first(self, c, name): + rawfirst = self.first[name] + first = {} + for label in rawfirst: + ilabel = self.make_label(c, label) + ##assert ilabel not in first # XXX failed on <> ... != + first[ilabel] = 1 + return first + + def make_label(self, c, label): + # XXX Maybe this should be a method on a subclass of converter? + ilabel = len(c.labels) + if label[0].isalpha(): + # Either a symbol name or a named token + if label in c.symbol2number: + # A symbol name (a non-terminal) + if label in c.symbol2label: + return c.symbol2label[label] + else: + c.labels.append((c.symbol2number[label], None)) + c.symbol2label[label] = ilabel + return ilabel + else: + # A named token (NAME, NUMBER, STRING) + itoken = getattr(token, label, None) + assert isinstance(itoken, int), label + assert itoken in token.tok_name, label + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + else: + # Either a keyword or an operator + assert label[0] in ('"', "'"), label + value = eval(label) + if value[0].isalpha(): + # A keyword + if value in c.keywords: + return c.keywords[value] + else: + c.labels.append((token.NAME, value)) + c.keywords[value] = ilabel + return ilabel + else: + # An operator (any non-numeric token) + itoken = grammar.opmap[value] # Fails if unknown token + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + + def addfirstsets(self): + names = self.dfas.keys() + names.sort() + for name in names: + if name not in self.first: + self.calcfirst(name) + #print name, self.first[name].keys() + + def calcfirst(self, name): + dfa = self.dfas[name] + self.first[name] = None # dummy to detect left recursion + state = dfa[0] + totalset = {} + overlapcheck = {} + for label, next in state.arcs.iteritems(): + if label in self.dfas: + if label in self.first: + fset = self.first[label] + if fset is None: + raise ValueError("recursion for rule %r" % name) + else: + self.calcfirst(label) + fset = self.first[label] + totalset.update(fset) + overlapcheck[label] = fset + else: + totalset[label] = 1 + overlapcheck[label] = {label: 1} + inverse = {} + for label, itsfirst in overlapcheck.iteritems(): + for symbol in itsfirst: + if symbol in inverse: + raise ValueError("rule %s is ambiguous; %s is in the" + " first sets of %s as well as %s" % + (name, symbol, label, inverse[symbol])) + inverse[symbol] = label + self.first[name] = totalset + + def parse(self): + dfas = {} + startsymbol = None + # MSTART: (NEWLINE | RULE)* ENDMARKER + while self.type != token.ENDMARKER: + while self.type == token.NEWLINE: + self.gettoken() + # RULE: NAME ':' RHS NEWLINE + name = self.expect(token.NAME) + self.expect(token.OP, ":") + a, z = self.parse_rhs() + self.expect(token.NEWLINE) + #self.dump_nfa(name, a, z) + dfa = self.make_dfa(a, z) + #self.dump_dfa(name, dfa) + oldlen = len(dfa) + self.simplify_dfa(dfa) + newlen = len(dfa) + dfas[name] = dfa + #print name, oldlen, newlen + if startsymbol is None: + startsymbol = name + return dfas, startsymbol + + def make_dfa(self, start, finish): + # To turn an NFA into a DFA, we define the states of the DFA + # to correspond to *sets* of states of the NFA. Then do some + # state reduction. Let's represent sets as dicts with 1 for + # values. + assert isinstance(start, NFAState) + assert isinstance(finish, NFAState) + def closure(state): + base = {} + addclosure(state, base) + return base + def addclosure(state, base): + assert isinstance(state, NFAState) + if state in base: + return + base[state] = 1 + for label, next in state.arcs: + if label is None: + addclosure(next, base) + states = [DFAState(closure(start), finish)] + for state in states: # NB states grows while we're iterating + arcs = {} + for nfastate in state.nfaset: + for label, next in nfastate.arcs: + if label is not None: + addclosure(next, arcs.setdefault(label, {})) + for label, nfaset in arcs.iteritems(): + for st in states: + if st.nfaset == nfaset: + break + else: + st = DFAState(nfaset, finish) + states.append(st) + state.addarc(st, label) + return states # List of DFAState instances; first one is start + + def dump_nfa(self, name, start, finish): + print "Dump of NFA for", name + todo = [start] + for i, state in enumerate(todo): + print " State", i, state is finish and "(final)" or "" + for label, next in state.arcs: + if next in todo: + j = todo.index(next) + else: + j = len(todo) + todo.append(next) + if label is None: + print " -> %d" % j + else: + print " %s -> %d" % (label, j) + + def dump_dfa(self, name, dfa): + print "Dump of DFA for", name + for i, state in enumerate(dfa): + print " State", i, state.isfinal and "(final)" or "" + for label, next in state.arcs.iteritems(): + print " %s -> %d" % (label, dfa.index(next)) + + def simplify_dfa(self, dfa): + # This is not theoretically optimal, but works well enough. + # Algorithm: repeatedly look for two states that have the same + # set of arcs (same labels pointing to the same nodes) and + # unify them, until things stop changing. + + # dfa is a list of DFAState instances + changes = True + while changes: + changes = False + for i, state_i in enumerate(dfa): + for j in range(i+1, len(dfa)): + state_j = dfa[j] + if state_i == state_j: + #print " unify", i, j + del dfa[j] + for state in dfa: + state.unifystate(state_j, state_i) + changes = True + break + + def parse_rhs(self): + # RHS: ALT ('|' ALT)* + a, z = self.parse_alt() + if self.value != "|": + return a, z + else: + aa = NFAState() + zz = NFAState() + aa.addarc(a) + z.addarc(zz) + while self.value == "|": + self.gettoken() + a, z = self.parse_alt() + aa.addarc(a) + z.addarc(zz) + return aa, zz + + def parse_alt(self): + # ALT: ITEM+ + a, b = self.parse_item() + while (self.value in ("(", "[") or + self.type in (token.NAME, token.STRING)): + c, d = self.parse_item() + b.addarc(c) + b = d + return a, b + + def parse_item(self): + # ITEM: '[' RHS ']' | ATOM ['+' | '*'] + if self.value == "[": + self.gettoken() + a, z = self.parse_rhs() + self.expect(token.OP, "]") + a.addarc(z) + return a, z + else: + a, z = self.parse_atom() + value = self.value + if value not in ("+", "*"): + return a, z + self.gettoken() + z.addarc(a) + if value == "+": + return a, z + else: + return a, a + + def parse_atom(self): + # ATOM: '(' RHS ')' | NAME | STRING + if self.value == "(": + self.gettoken() + a, z = self.parse_rhs() + self.expect(token.OP, ")") + return a, z + elif self.type in (token.NAME, token.STRING): + a = NFAState() + z = NFAState() + a.addarc(z, self.value) + self.gettoken() + return a, z + else: + self.raise_error("expected (...) or NAME or STRING, got %s/%s", + self.type, self.value) + + def expect(self, type, value=None): + if self.type != type or (value is not None and self.value != value): + self.raise_error("expected %s/%s, got %s/%s", + type, value, self.type, self.value) + value = self.value + self.gettoken() + return value + + def gettoken(self): + tup = self.generator.next() + while tup[0] in (tokenize.COMMENT, tokenize.NL): + tup = self.generator.next() + self.type, self.value, self.begin, self.end, self.line = tup + #print token.tok_name[self.type], repr(self.value) + + def raise_error(self, msg, *args): + if args: + try: + msg = msg % args + except: + msg = " ".join([msg] + map(str, args)) + raise SyntaxError(msg, (self.filename, self.end[0], + self.end[1], self.line)) + +class NFAState(object): + + def __init__(self): + self.arcs = [] # list of (label, NFAState) pairs + + def addarc(self, next, label=None): + assert label is None or isinstance(label, str) + assert isinstance(next, NFAState) + self.arcs.append((label, next)) + +class DFAState(object): + + def __init__(self, nfaset, final): + assert isinstance(nfaset, dict) + assert isinstance(iter(nfaset).next(), NFAState) + assert isinstance(final, NFAState) + self.nfaset = nfaset + self.isfinal = final in nfaset + self.arcs = {} # map from label to DFAState + + def addarc(self, next, label): + assert isinstance(label, str) + assert label not in self.arcs + assert isinstance(next, DFAState) + self.arcs[label] = next + + def unifystate(self, old, new): + for label, next in self.arcs.iteritems(): + if next is old: + self.arcs[label] = new + + def __eq__(self, other): + # Equality test -- ignore the nfaset instance variable + assert isinstance(other, DFAState) + if self.isfinal != other.isfinal: + return False + # Can't just return self.arcs == other.arcs, because that + # would invoke this method recursively, with cycles... + if len(self.arcs) != len(other.arcs): + return False + for label, next in self.arcs.iteritems(): + if next is not other.arcs.get(label): + return False + return True + +def generate_grammar(filename="Grammar.txt"): + p = ParserGenerator(filename) + return p.make_grammar() diff --git a/sphinx/pycode/pgen2/token.py b/sphinx/pycode/pgen2/token.py new file mode 100755 index 000000000..61468b313 --- /dev/null +++ b/sphinx/pycode/pgen2/token.py @@ -0,0 +1,82 @@ +#! /usr/bin/env python + +"""Token constants (from "token.h").""" + +# Taken from Python (r53757) and modified to include some tokens +# originally monkeypatched in by pgen2.tokenize + +#--start constants-- +ENDMARKER = 0 +NAME = 1 +NUMBER = 2 +STRING = 3 +NEWLINE = 4 +INDENT = 5 +DEDENT = 6 +LPAR = 7 +RPAR = 8 +LSQB = 9 +RSQB = 10 +COLON = 11 +COMMA = 12 +SEMI = 13 +PLUS = 14 +MINUS = 15 +STAR = 16 +SLASH = 17 +VBAR = 18 +AMPER = 19 +LESS = 20 +GREATER = 21 +EQUAL = 22 +DOT = 23 +PERCENT = 24 +BACKQUOTE = 25 +LBRACE = 26 +RBRACE = 27 +EQEQUAL = 28 +NOTEQUAL = 29 +LESSEQUAL = 30 +GREATEREQUAL = 31 +TILDE = 32 +CIRCUMFLEX = 33 +LEFTSHIFT = 34 +RIGHTSHIFT = 35 +DOUBLESTAR = 36 +PLUSEQUAL = 37 +MINEQUAL = 38 +STAREQUAL = 39 +SLASHEQUAL = 40 +PERCENTEQUAL = 41 +AMPEREQUAL = 42 +VBAREQUAL = 43 +CIRCUMFLEXEQUAL = 44 +LEFTSHIFTEQUAL = 45 +RIGHTSHIFTEQUAL = 46 +DOUBLESTAREQUAL = 47 +DOUBLESLASH = 48 +DOUBLESLASHEQUAL = 49 +AT = 50 +OP = 51 +COMMENT = 52 +NL = 53 +RARROW = 54 +ERRORTOKEN = 55 +N_TOKENS = 56 +NT_OFFSET = 256 +#--end constants-- + +tok_name = {} +for _name, _value in globals().items(): + if type(_value) is type(0): + tok_name[_value] = _name + + +def ISTERMINAL(x): + return x < NT_OFFSET + +def ISNONTERMINAL(x): + return x >= NT_OFFSET + +def ISEOF(x): + return x == ENDMARKER diff --git a/sphinx/pycode/pgen2/tokenize.py b/sphinx/pycode/pgen2/tokenize.py new file mode 100644 index 000000000..46ee7842b --- /dev/null +++ b/sphinx/pycode/pgen2/tokenize.py @@ -0,0 +1,405 @@ +# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. +# All rights reserved. + +"""Tokenization help for Python programs. + +generate_tokens(readline) is a generator that breaks a stream of +text into Python tokens. It accepts a readline-like method which is called +repeatedly to get the next line of input (or "" for EOF). It generates +5-tuples with these members: + + the token type (see token.py) + the token (a string) + the starting (row, column) indices of the token (a 2-tuple of ints) + the ending (row, column) indices of the token (a 2-tuple of ints) + the original line (string) + +It is designed to match the working of the Python tokenizer exactly, except +that it produces COMMENT tokens for comments and gives type OP for all +operators + +Older entry points + tokenize_loop(readline, tokeneater) + tokenize(readline, tokeneater=printtoken) +are the same, except instead of generating tokens, tokeneater is a callback +function to which the 5 fields described above are passed as 5 arguments, +each time a new token is found.""" + +__author__ = 'Ka-Ping Yee <ping@lfw.org>' +__credits__ = \ + 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' + +import string, re +from sphinx.pycode.pgen2.token import * +from sphinx.pycode.pgen2 import token + +__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize", + "generate_tokens", "untokenize"] +del token + +def group(*choices): return '(' + '|'.join(choices) + ')' +def any(*choices): return group(*choices) + '*' +def maybe(*choices): return group(*choices) + '?' + +Whitespace = r'[ \f\t]*' +Comment = r'#[^\r\n]*' +Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) +Name = r'[a-zA-Z_]\w*' + +Binnumber = r'0[bB][01]*' +Hexnumber = r'0[xX][\da-fA-F]*[lL]?' +Octnumber = r'0[oO]?[0-7]*[lL]?' +Decnumber = r'[1-9]\d*[lL]?' +Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber) +Exponent = r'[eE][-+]?\d+' +Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) +Expfloat = r'\d+' + Exponent +Floatnumber = group(Pointfloat, Expfloat) +Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') +Number = group(Imagnumber, Floatnumber, Intnumber) + +# Tail end of ' string. +Single = r"[^'\\]*(?:\\.[^'\\]*)*'" +# Tail end of " string. +Double = r'[^"\\]*(?:\\.[^"\\]*)*"' +# Tail end of ''' string. +Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" +# Tail end of """ string. +Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' +Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""') +# Single-line ' or " string. +String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", + r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') + +# Because of leftmost-then-longest match semantics, be sure to put the +# longest operators first (e.g., if = came before ==, == would get +# recognized as two instances of =). +Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", + r"//=?", r"->", + r"[+\-*/%&|^=<>]=?", + r"~") + +Bracket = '[][(){}]' +Special = group(r'\r?\n', r'[:;.,`@]') +Funny = group(Operator, Bracket, Special) + +PlainToken = group(Number, Funny, String, Name) +Token = Ignore + PlainToken + +# First (or only) line of ' or " string. +ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + + group("'", r'\\\r?\n'), + r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + + group('"', r'\\\r?\n')) +PseudoExtras = group(r'\\\r?\n', Comment, Triple) +PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) + +tokenprog, pseudoprog, single3prog, double3prog = map( + re.compile, (Token, PseudoToken, Single3, Double3)) +endprogs = {"'": re.compile(Single), '"': re.compile(Double), + "'''": single3prog, '"""': double3prog, + "r'''": single3prog, 'r"""': double3prog, + "u'''": single3prog, 'u"""': double3prog, + "b'''": single3prog, 'b"""': double3prog, + "ur'''": single3prog, 'ur"""': double3prog, + "br'''": single3prog, 'br"""': double3prog, + "R'''": single3prog, 'R"""': double3prog, + "U'''": single3prog, 'U"""': double3prog, + "B'''": single3prog, 'B"""': double3prog, + "uR'''": single3prog, 'uR"""': double3prog, + "Ur'''": single3prog, 'Ur"""': double3prog, + "UR'''": single3prog, 'UR"""': double3prog, + "bR'''": single3prog, 'bR"""': double3prog, + "Br'''": single3prog, 'Br"""': double3prog, + "BR'''": single3prog, 'BR"""': double3prog, + 'r': None, 'R': None, + 'u': None, 'U': None, + 'b': None, 'B': None} + +triple_quoted = {} +for t in ("'''", '"""', + "r'''", 'r"""', "R'''", 'R"""', + "u'''", 'u"""', "U'''", 'U"""', + "b'''", 'b"""', "B'''", 'B"""', + "ur'''", 'ur"""', "Ur'''", 'Ur"""', + "uR'''", 'uR"""', "UR'''", 'UR"""', + "br'''", 'br"""', "Br'''", 'Br"""', + "bR'''", 'bR"""', "BR'''", 'BR"""',): + triple_quoted[t] = t +single_quoted = {} +for t in ("'", '"', + "r'", 'r"', "R'", 'R"', + "u'", 'u"', "U'", 'U"', + "b'", 'b"', "B'", 'B"', + "ur'", 'ur"', "Ur'", 'Ur"', + "uR'", 'uR"', "UR'", 'UR"', + "br'", 'br"', "Br'", 'Br"', + "bR'", 'bR"', "BR'", 'BR"', ): + single_quoted[t] = t + +tabsize = 8 + +class TokenError(Exception): pass + +class StopTokenizing(Exception): pass + +def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing + print "%d,%d-%d,%d:\t%s\t%s" % \ + (srow, scol, erow, ecol, tok_name[type], repr(token)) + +def tokenize(readline, tokeneater=printtoken): + """ + The tokenize() function accepts two parameters: one representing the + input stream, and one providing an output mechanism for tokenize(). + + The first parameter, readline, must be a callable object which provides + the same interface as the readline() method of built-in file objects. + Each call to the function should return one line of input as a string. + + The second parameter, tokeneater, must also be a callable object. It is + called once for each token, with five arguments, corresponding to the + tuples generated by generate_tokens(). + """ + try: + tokenize_loop(readline, tokeneater) + except StopTokenizing: + pass + +# backwards compatible interface +def tokenize_loop(readline, tokeneater): + for token_info in generate_tokens(readline): + tokeneater(*token_info) + +class Untokenizer: + + def __init__(self): + self.tokens = [] + self.prev_row = 1 + self.prev_col = 0 + + def add_whitespace(self, start): + row, col = start + assert row <= self.prev_row + col_offset = col - self.prev_col + if col_offset: + self.tokens.append(" " * col_offset) + + def untokenize(self, iterable): + for t in iterable: + if len(t) == 2: + self.compat(t, iterable) + break + tok_type, token, start, end, line = t + self.add_whitespace(start) + self.tokens.append(token) + self.prev_row, self.prev_col = end + if tok_type in (NEWLINE, NL): + self.prev_row += 1 + self.prev_col = 0 + return "".join(self.tokens) + + def compat(self, token, iterable): + startline = False + indents = [] + toks_append = self.tokens.append + toknum, tokval = token + if toknum in (NAME, NUMBER): + tokval += ' ' + if toknum in (NEWLINE, NL): + startline = True + for tok in iterable: + toknum, tokval = tok[:2] + + if toknum in (NAME, NUMBER): + tokval += ' ' + + if toknum == INDENT: + indents.append(tokval) + continue + elif toknum == DEDENT: + indents.pop() + continue + elif toknum in (NEWLINE, NL): + startline = True + elif startline and indents: + toks_append(indents[-1]) + startline = False + toks_append(tokval) + +def untokenize(iterable): + """Transform tokens back into Python source code. + + Each element returned by the iterable must be a token sequence + with at least two elements, a token number and token value. If + only two tokens are passed, the resulting output is poor. + + Round-trip invariant for full input: + Untokenized source will match input source exactly + + Round-trip invariant for limited intput: + # Output text will tokenize the back to the input + t1 = [tok[:2] for tok in generate_tokens(f.readline)] + newcode = untokenize(t1) + readline = iter(newcode.splitlines(1)).next + t2 = [tok[:2] for tokin generate_tokens(readline)] + assert t1 == t2 + """ + ut = Untokenizer() + return ut.untokenize(iterable) + +def generate_tokens(readline): + """ + The generate_tokens() generator requires one argment, readline, which + must be a callable object which provides the same interface as the + readline() method of built-in file objects. Each call to the function + should return one line of input as a string. Alternately, readline + can be a callable function terminating with StopIteration: + readline = open(myfile).next # Example of alternate readline + + The generator produces 5-tuples with these members: the token type; the + token string; a 2-tuple (srow, scol) of ints specifying the row and + column where the token begins in the source; a 2-tuple (erow, ecol) of + ints specifying the row and column where the token ends in the source; + and the line on which the token was found. The line passed is the + logical line; continuation lines are included. + """ + lnum = parenlev = continued = 0 + namechars, numchars = string.ascii_letters + '_', '0123456789' + contstr, needcont = '', 0 + contline = None + indents = [0] + + while 1: # loop over lines in stream + try: + line = readline() + except StopIteration: + line = '' + lnum = lnum + 1 + pos, max = 0, len(line) + + if contstr: # continued string + if not line: + raise TokenError, ("EOF in multi-line string", strstart) + endmatch = endprog.match(line) + if endmatch: + pos = end = endmatch.end(0) + yield (STRING, contstr + line[:end], + strstart, (lnum, end), contline + line) + contstr, needcont = '', 0 + contline = None + elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': + yield (ERRORTOKEN, contstr + line, + strstart, (lnum, len(line)), contline) + contstr = '' + contline = None + continue + else: + contstr = contstr + line + contline = contline + line + continue + + elif parenlev == 0 and not continued: # new statement + if not line: break + column = 0 + while pos < max: # measure leading whitespace + if line[pos] == ' ': column = column + 1 + elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize + elif line[pos] == '\f': column = 0 + else: break + pos = pos + 1 + if pos == max: break + + if line[pos] in '#\r\n': # skip comments or blank lines + if line[pos] == '#': + comment_token = line[pos:].rstrip('\r\n') + nl_pos = pos + len(comment_token) + yield (COMMENT, comment_token, + (lnum, pos), (lnum, pos + len(comment_token)), line) + yield (NL, line[nl_pos:], + (lnum, nl_pos), (lnum, len(line)), line) + else: + yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], + (lnum, pos), (lnum, len(line)), line) + continue + + if column > indents[-1]: # count indents or dedents + indents.append(column) + yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) + while column < indents[-1]: + if column not in indents: + raise IndentationError( + "unindent does not match any outer indentation level", + ("<tokenize>", lnum, pos, line)) + indents = indents[:-1] + yield (DEDENT, '', (lnum, pos), (lnum, pos), line) + + else: # continued statement + if not line: + raise TokenError, ("EOF in multi-line statement", (lnum, 0)) + continued = 0 + + while pos < max: + pseudomatch = pseudoprog.match(line, pos) + if pseudomatch: # scan for tokens + start, end = pseudomatch.span(1) + spos, epos, pos = (lnum, start), (lnum, end), end + token, initial = line[start:end], line[start] + + if initial in numchars or \ + (initial == '.' and token != '.'): # ordinary number + yield (NUMBER, token, spos, epos, line) + elif initial in '\r\n': + newline = NEWLINE + if parenlev > 0: + newline = NL + yield (newline, token, spos, epos, line) + elif initial == '#': + assert not token.endswith("\n") + yield (COMMENT, token, spos, epos, line) + elif token in triple_quoted: + endprog = endprogs[token] + endmatch = endprog.match(line, pos) + if endmatch: # all on one line + pos = endmatch.end(0) + token = line[start:pos] + yield (STRING, token, spos, (lnum, pos), line) + else: + strstart = (lnum, start) # multiple lines + contstr = line[start:] + contline = line + break + elif initial in single_quoted or \ + token[:2] in single_quoted or \ + token[:3] in single_quoted: + if token[-1] == '\n': # continued string + strstart = (lnum, start) + endprog = (endprogs[initial] or endprogs[token[1]] or + endprogs[token[2]]) + contstr, needcont = line[start:], 1 + contline = line + break + else: # ordinary string + yield (STRING, token, spos, epos, line) + elif initial in namechars: # ordinary name + yield (NAME, token, spos, epos, line) + elif initial == '\\': # continued stmt + # This yield is new; needed for better idempotency: + yield (NL, token, spos, (lnum, pos), line) + continued = 1 + else: + if initial in '([{': parenlev = parenlev + 1 + elif initial in ')]}': parenlev = parenlev - 1 + yield (OP, token, spos, epos, line) + else: + yield (ERRORTOKEN, line[pos], + (lnum, pos), (lnum, pos+1), line) + pos = pos + 1 + + for indent in indents[1:]: # pop remaining indent levels + yield (DEDENT, '', (lnum, 0), (lnum, 0), '') + yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') + +if __name__ == '__main__': # testing + import sys + if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) + else: tokenize(sys.stdin.readline) diff --git a/sphinx/pycode/pytree.py b/sphinx/pycode/pytree.py new file mode 100644 index 000000000..950319c59 --- /dev/null +++ b/sphinx/pycode/pytree.py @@ -0,0 +1,295 @@ +# Copyright 2006 Google, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""Python parse tree definitions. + +This is a very concrete parse tree; we need to keep every token and +even the comments and whitespace between tokens. + +Adapted for read-only nodes from pytree.py in Python's 2to3 tool, and +added a few bits. +""" + +__author__ = "Guido van Rossum <guido@python.org>" + + +class Base(object): + + """Abstract base class for Node and Leaf. + + This provides some default functionality and boilerplate using the + template pattern. + + A node may be a subnode of at most one parent. + """ + + # Default values for instance variables + type = None # int: token number (< 256) or symbol number (>= 256) + parent = None # Parent node pointer, or None + children = () # Tuple of subnodes + was_changed = False + + def __new__(cls, *args, **kwds): + """Constructor that prevents Base from being instantiated.""" + assert cls is not Base, "Cannot instantiate Base" + return object.__new__(cls) + + def __eq__(self, other): + """Compares two nodes for equality. + + This calls the method _eq(). + """ + if self.__class__ is not other.__class__: + return NotImplemented + return self._eq(other) + + def __ne__(self, other): + """Compares two nodes for inequality. + + This calls the method _eq(). + """ + if self.__class__ is not other.__class__: + return NotImplemented + return not self._eq(other) + + def _eq(self, other): + """Compares two nodes for equality. + + This is called by __eq__ and __ne__. It is only called if the + two nodes have the same type. This must be implemented by the + concrete subclass. Nodes should be considered equal if they + have the same structure, ignoring the prefix string and other + context information. + """ + raise NotImplementedError + + def get_lineno(self): + """Returns the line number which generated the invocant node.""" + node = self + while not isinstance(node, Leaf): + if not node.children: + return + node = node.children[0] + return node.lineno + + def get_next_sibling(self): + """Return the node immediately following the invocant in their + parent's children list. If the invocant does not have a next + sibling, return None.""" + if self.parent is None: + return None + + # Can't use index(); we need to test by identity + for i, child in enumerate(self.parent.children): + if child is self: + try: + return self.parent.children[i+1] + except IndexError: + return None + + def get_prev_sibling(self): + """Return the node immediately preceding the invocant in their + parent's children list. If the invocant does not have a previous + sibling, return None.""" + if self.parent is None: + return None + + # Can't use index(); we need to test by identity + for i, child in enumerate(self.parent.children): + if child is self: + if i == 0: + return None + return self.parent.children[i-1] + + def get_prev_leaf(self): + """Return the leaf node that precedes this node in the parse tree.""" + def last_child(node): + if isinstance(node, Leaf): + return node + elif not node.children: + return None + else: + return last_child(node.children[-1]) + if self.parent is None: + return None + prev = self.get_prev_sibling() + if isinstance(prev, Leaf): + return prev + elif prev is not None: + return last_child(prev) + return self.parent.get_prev_leaf() + + def get_suffix(self): + """Return the string immediately following the invocant node. This + is effectively equivalent to node.get_next_sibling().get_prefix()""" + next_sib = self.get_next_sibling() + if next_sib is None: + return "" + return next_sib.get_prefix() + + +class Node(Base): + + """Concrete implementation for interior nodes.""" + + def __init__(self, type, children, context=None, prefix=None): + """Initializer. + + Takes a type constant (a symbol number >= 256), a sequence of + child nodes, and an optional context keyword argument. + + As a side effect, the parent pointers of the children are updated. + """ + assert type >= 256, type + self.type = type + self.children = list(children) + for ch in self.children: + assert ch.parent is None, repr(ch) + ch.parent = self + if prefix is not None: + self.set_prefix(prefix) + + def __repr__(self): + return "%s(%s, %r)" % (self.__class__.__name__, + self.type, self.children) + + def __str__(self): + """This reproduces the input source exactly.""" + return "".join(map(str, self.children)) + + def compact(self): + return ''.join(child.compact() for child in self.children) + + def __getitem__(self, index): + return self.children[index] + + def __iter__(self): + return iter(self.children) + + def _eq(self, other): + """Compares two nodes for equality.""" + return (self.type, self.children) == (other.type, other.children) + + def post_order(self): + """Returns a post-order iterator for the tree.""" + for child in self.children: + for node in child.post_order(): + yield node + yield self + + def pre_order(self): + """Returns a pre-order iterator for the tree.""" + yield self + for child in self.children: + for node in child.post_order(): + yield node + + def get_prefix(self): + """Returns the prefix for the node. + + This passes the call on to the first child. + """ + if not self.children: + return "" + return self.children[0].get_prefix() + + +class Leaf(Base): + + """Concrete implementation for leaf nodes.""" + + # Default values for instance variables + prefix = "" # Whitespace and comments preceding this token in the input + lineno = 0 # Line where this token starts in the input + column = 0 # Column where this token tarts in the input + + def __init__(self, type, value, context=None, prefix=None): + """Initializer. + + Takes a type constant (a token number < 256), a string value, + and an optional context keyword argument. + """ + assert 0 <= type < 256, type + if context is not None: + self.prefix, (self.lineno, self.column) = context + self.type = type + self.value = value + if prefix is not None: + self.prefix = prefix + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, + self.type, self.value) + + def __str__(self): + """This reproduces the input source exactly.""" + return self.prefix + str(self.value) + + def compact(self): + return str(self.value) + + def _eq(self, other): + """Compares two nodes for equality.""" + return (self.type, self.value) == (other.type, other.value) + + def post_order(self): + """Returns a post-order iterator for the tree.""" + yield self + + def pre_order(self): + """Returns a pre-order iterator for the tree.""" + yield self + + def get_prefix(self): + """Returns the prefix for the node.""" + return self.prefix + + +def convert(grammar, raw_node): + """Convert raw node to a Node or Leaf instance.""" + type, value, context, children = raw_node + if children or type in grammar.number2symbol: + # If there's exactly one child, return that child instead of + # creating a new node. + if len(children) == 1: + return children[0] + return Node(type, children, context=context) + else: + return Leaf(type, value, context=context) + + +def nice_repr(node, number2name, prefix=False): + def _repr(node): + if isinstance(node, Leaf): + return "%s(%r)" % (number2name[node.type], node.value) + else: + return "%s(%s)" % (number2name[node.type], + ', '.join(map(_repr, node.children))) + def _prepr(node): + if isinstance(node, Leaf): + return "%s(%r, %r)" % (number2name[node.type], node.prefix, node.value) + else: + return "%s(%s)" % (number2name[node.type], + ', '.join(map(_prepr, node.children))) + return (prefix and _prepr or _repr)(node) + + +class NodeVisitor(object): + def __init__(self, number2name): + self.number2name = number2name + self.init() + + def init(self): + pass + + def visit(self, node): + """Visit a node.""" + method = 'visit_' + self.number2name[node.type] + visitor = getattr(self, method, self.generic_visit) + return visitor(node) + + def generic_visit(self, node): + """Called if no explicit visitor function exists for a node.""" + if isinstance(node, Node): + for child in node: + self.visit(child) |