From b75e46e8a06dd8170c9fbabaee4c4aaa22050e67 Mon Sep 17 00:00:00 2001
From: goodger
Date: Thu, 12 Dec 2002 03:26:55 +0000
Subject: Updated. Dead end with AssignmentVisitor reconstructing expressions.
 TokenReader seems to be the way to go.

git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@1017 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
---
 docutils/readers/python/moduleparser.py | 250 ++++++++++++++++++++++++++++++--
 1 file changed, 238 insertions(+), 12 deletions(-)

diff --git a/docutils/readers/python/moduleparser.py b/docutils/readers/python/moduleparser.py
index 5aab372b2..9ab3eea79 100644
--- a/docutils/readers/python/moduleparser.py
+++ b/docutils/readers/python/moduleparser.py
@@ -14,6 +14,150 @@ Ideas:
 * Merge the compiler & tokenize output such that the raw text hangs off
   of nodes? Especially assignment expressions (RHS).
 
+What I'd like to do is to take a module, read in the text, run it through the
+module parser (using compiler.py and tokenize.py), and produce a high-level
+AST full of nodes that are interesting from an auto-documentation standpoint.
+For example, given this module (x.py)::
+
+    # comment
+
+    '''Docstring'''
+
+    '''Additional docstring'''
+
+    __docformat__ = 'reStructuredText'
+
+    a = 1
+    '''Attribute docstring'''
+
+    class C(Super):
+
+        '''C's docstring'''
+
+        class_attribute = 1
+        '''class_attribute's docstring'''
+
+        def __init__(self, text=None):
+            '''__init__'s docstring'''
+
+            self.instance_attribute = (text * 7
+                                       + ' whaddyaknow')
+            '''instance_attribute's docstring'''
+
+
+    def f(x,                            # parameter x
+          y=a*5,                        # parameter y
+          *args):                       # parameter args
+        '''f's docstring'''
+        return [x + item for item in args]
+
+    f.function_attribute = 1
+    '''f.function_attribute's docstring'''
+
+The module parser should produce a high-level AST, something like this::
+
+    <Module filename="x.py">
+        <Comment lineno=1>
+            comment
+        <Docstring lineno=3>
+            Docstring
+        <Docstring lineno=5>            (I'll leave out the lineno's)
+            Additional docstring
+        <Attribute name="__docformat__">
+            <Expression>
+                'reStructuredText'
+        <Attribute name="a">
+            <Expression>
+                1
+            <Docstring>
+                Attribute docstring
+        <Class name="C" bases="Super">
+            <Docstring>
+                C's docstring
+            <Attribute name="class_attribute">
+                <Expression>
+                    1
+                <Docstring>
+                    class_attribute's docstring
+            <Method name="__init__">
+                <Docstring>
+                    __init__'s docstring
+                <Attribute name="self.instance_attribute">
+                    <Expression>
+                        (text * 7
+                         + ' whaddyaknow')
+                    <Docstring>
+                        instance_attribute's docstring
+        <Function name="f">
+            <ParameterList>
+                <Parameter name="x">
+                    <Comment>
+                        # parameter x
+                <Parameter name="y">
+                    <Default>
+                        a*5
+                    <Comment>
+                        # parameter y
+                <Parameter name="args" varargs=1>
+                    <Comment>
+                        # parameter args
+            <Docstring>
+                f's docstring
+            <Attribute name="f.function_attribute">
+                <Expression>
+                    1
+                <Docstring>
+                    f.function_attribute's docstring
+
+compiler.parse() provides most of what's needed for this AST.  I think that
+"tokenize" can be used to get the rest, and all that's left is to hunker down
+and figure out how.  We can determine the line number from the
+compiler.parse() AST, and a get_rhs(lineno) method would provide the rest.
+
+The Docutils Python reader component will transform this AST into a
+Python-specific doctree, and then a `stylist transform`_ would further
+transform it into a generic doctree.  Namespaces will have to be compiled for
+each of the scopes, but I'm not certain at what stage of processing.
+
+It's very important to keep all docstring processing out of this, so that it's
+completely generic and not tool-specific.
+
+> Why perform all of those transformations?  Why not go from the AST to a
+> generic doctree?  Or even from the AST to the final output?
+
+I want the docutils.readers.python.moduleparser.parse_module() function to
+produce a standard documentation-oriented AST that can be used by any tool.
+We can develop it together without having to compromise on the rest of our
+design (i.e., HappyDoc doesn't have to be made to work like Docutils, and
+vice versa).  It would be a higher-level version of what compiler.py provides.
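+
+As a rough sketch of the intended interface (just a sketch, assuming Module
+nodes render themselves as the pseudo-XML above via Node.__str__)::
+
+    module_text = open('x.py').read()
+    module = parse_module(module_text, 'x.py')
+    print module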
+
+The Python reader component transforms this generic AST into a Python-specific
+doctree (it knows about modules, classes, functions, etc.), but this is
+specific to Docutils and cannot be used by HappyDoc or others.  The stylist
+transform does the final layout, converting Python-specific structures
+("class" sections, etc.) into a generic doctree using primitives (tables,
+sections, lists, etc.).  This generic doctree does *not* know about Python
+structures any more.  The advantage is that this doctree can be handed off to
+any of the output writers to create any output format we like.
+
+The latter two transforms are separate because I want to be able to have
+multiple independent layout styles (multiple runtime-selectable "stylist
+transforms").  Each of the existing tools (HappyDoc, pydoc, epydoc, Crystal,
+etc.) has its own fixed format.  I personally don't like the tables-based
+format produced by these tools, and I'd like to be able to customize the
+format easily.  That's the goal of stylist transforms, which are independent
+of the Reader component itself.  One stylist transform could produce
+HappyDoc-like output, another could produce output similar to module docs in
+the Python library reference manual, and so on.
+
+That's exactly the reason for this requirement:
+
+>> It's very important to keep all docstring processing out of this, so that
+>> it's completely generic and not tool-specific.
+
+... but it goes beyond docstring processing.  It's also important to keep
+style decisions and tool-specific data transforms out of this module parser.
+
 """
 
 __docformat__ = 'reStructuredText'
 
@@ -21,8 +165,10 @@ __docformat__ = 'reStructuredText'
 import sys
 import compiler
 import compiler.ast
-import compiler.visitor
+import tokenize
+import token
 from compiler.consts import OP_ASSIGN
+from compiler.visitor import ASTVisitor
 from types import StringType, UnicodeType
 
 
@@ -33,10 +179,10 @@ def parse_module(module_text, filename):
     return visitor.module
 
 
-class ModuleVisitor(compiler.visitor.ASTVisitor):
+class ModuleVisitor(ASTVisitor):
 
     def __init__(self, filename):
-        compiler.visitor.ASTVisitor.__init__(self)
+        ASTVisitor.__init__(self)
         self.filename = filename
         self.module = None
         self.context = []
@@ -45,11 +191,11 @@ class ModuleVisitor(compiler.visitor.ASTVisitor):
     def default(self, node, *args):
         self.documentable = None
         #print 'in default (%s)' % node.__class__.__name__
-        #compiler.visitor.ASTVisitor.default(self, node, *args)
+        #ASTVisitor.default(self, node, *args)
 
     def default_ignore(self, node, *args):
         #print 'in default_ignore (%s)' % node.__class__.__name__
-        compiler.visitor.ASTVisitor.default(self, node, *args)
+        ASTVisitor.default(self, node, *args)
 
     def visitModule(self, node):
         #print dir(node)
@@ -95,23 +241,66 @@ class ModuleVisitor(compiler.visitor.ASTVisitor):
         self.documentable = None
 
 
-class AssignmentVisitor(compiler.visitor.ASTVisitor):
+class AssignmentVisitor(ASTVisitor):
+
+    """
+    Tried reconstructing expressions (the RHS of assignments) by visiting
+    the compiler.parse() tree, but a lot of information is missing, like
+    parenthesis-grouping of expressions.
+
+    Gotta do it by parsing tokens.
+    """
 
     def __init__(self):
-        compiler.visitor.ASTVisitor.__init__(self)
+        ASTVisitor.__init__(self)
         self.attributes = []
+        self.parts = []
 
     def default(self, node, *args):
-        pass
+        print >>sys.stderr, '%s not visited!' % node.__class__.__name__
+        ASTVisitor.default(self, node)
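+
+    # Each visit* method below accumulates source-text fragments in
+    # self.parts as the tree is walked; visitAssign then joins the
+    # fragments into an Expression attached to the latest Attribute.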
 
     def visitAssign(self, node):
-        compiler.visitor.ASTVisitor.default(self, node)
+        ASTVisitor.default(self, node)
+        self.attributes[-1].append(Expression(node, ''.join(self.parts)))
 
     def visitAssName(self, node):
         self.attributes.append(Attribute(node, node.name))
 
-    def get_rhs(self, node):
-        return "'TBD'"
+    def visitAdd(self, node):
+        ASTVisitor.default(self, node)
+        # Join the two most recent operand fragments into one.
+        self.parts[-2:] = [' + '.join(self.parts[-2:])]
+
+    def visitAnd(self, node):
+        ASTVisitor.default(self, node)
+        self.parts.insert(len(self.parts) - 1, ' and ')
+
+    def visitBackquote(self, node):
+        self.parts.append('`')
+        ASTVisitor.default(self, node)
+        self.parts.append('`')
+
+    def visitBitand(self, node):
+        ASTVisitor.default(self, node)
+        self.parts.insert(len(self.parts) - 1, ' & ')
+
+    def visitBitor(self, node):
+        ASTVisitor.default(self, node)
+        self.parts.insert(len(self.parts) - 1, ' | ')
+
+    def visitBitxor(self, node):
+        ASTVisitor.default(self, node)
+        self.parts.insert(len(self.parts) - 1, ' ^ ')
+
+    def visitConst(self, node):
+        self.parts.append(repr(node.value))
+
+    def visitInvert(self, node):
+        self.parts.append('~ ')
+        ASTVisitor.default(self, node)
 
 
 class Node:
@@ -211,7 +400,44 @@ class Expression(Node):
     def __init__(self, node, text):
         Node.__init__(self, node)
         self.text = text
-
+
+    def __str__(self, indent='    ', level=0):
+        prefix = indent * (level + 1)
+        return '%s%s%s\n' % (Node.__str__(self, indent, level),
+                             prefix, self.text)
+
+
+class TokenReader:
+
+    def __init__(self, text):
+        self.text = text
+        self.lines = text.splitlines(1)
+        self.generator = tokenize.generate_tokens(iter(self.lines).next)
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        tok = self.generator.next()
+        self.type, self.string, self.start, self.end, self.line = tok
+        return tok
+
+    def goto_line(self, lineno):
+        # Advance to the first token at or beyond the given line.
+        for tok in self:
+            if self.start[0] >= lineno:
+                return tok
+        raise IndexError
+
+    def rhs(self, name, lineno):
+        self.goto_line(lineno)
+        while self.start[0] == lineno:
+            if self.type == token.OP and self.string == '=':
+                break
+            self.next()
+        else:
+            raise IndexError
+        # TODO: collect the tokens following '=' and return the RHS text.
 
 
 def trim_docstring(text):
     """