summaryrefslogtreecommitdiff
path: root/docutils/readers/python/moduleparser.py
diff options
context:
space:
mode:
authorgoodger <goodger@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>2002-12-12 03:26:55 +0000
committergoodger <goodger@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>2002-12-12 03:26:55 +0000
commitb75e46e8a06dd8170c9fbabaee4c4aaa22050e67 (patch)
tree12202e9746cf485b7afff0a7b3f579569a1c59f7 /docutils/readers/python/moduleparser.py
parentf4a682232afed2927fee6f572267b87559fa417b (diff)
downloaddocutils-b75e46e8a06dd8170c9fbabaee4c4aaa22050e67.tar.gz
Updated. Dead-end with AssignmentVisitor reconstructing expressions. TokenReader seems to be the way to go.
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@1017 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'docutils/readers/python/moduleparser.py')
-rw-r--r--docutils/readers/python/moduleparser.py250
1 files changed, 238 insertions, 12 deletions
diff --git a/docutils/readers/python/moduleparser.py b/docutils/readers/python/moduleparser.py
index 5aab372b2..9ab3eea79 100644
--- a/docutils/readers/python/moduleparser.py
+++ b/docutils/readers/python/moduleparser.py
@@ -14,6 +14,150 @@ Ideas:
* Merge the compiler & tokenize output such that the raw text hangs off of
nodes? Especially assignment expressions (RHS).
+What I'd like to do is to take a module, read in the text, run it through the
+module parser (using compiler.py and tokenize.py) and produce a high-level AST
+full of nodes that are interesting from an auto-documentation standpoint. For
+example, given this module (x.py)::
+
+ # comment
+
+ '''Docstring'''
+
+ '''Additional docstring'''
+
+ __docformat__ = 'reStructuredText'
+
+ a = 1
+ '''Attribute docstring'''
+
+ class C(Super):
+
+ '''C's docstring'''
+
+ class_attribute = 1
+ '''class_attribute's docstring'''
+
+ def __init__(self, text=None):
+ '''__init__'s docstring'''
+
+ self.instance_attribute = (text * 7
+ + ' whaddyaknow')
+ '''instance_attribute's docstring'''
+
+
+ def f(x, # parameter x
+ y=a*5, # parameter y
+ *args): # parameter args
+ '''f's docstring'''
+ return [x + item for item in args]
+
+ f.function_attribute = 1
+ '''f.function_attribute's docstring'''
+
+The module parser should produce a high-level AST, something like this::
+
+ <Module filename="x.py">
+ <Comment lineno=1>
+ comment
+ <Docstring lineno=3>
+ Docstring
+ <Docstring lineno=...> (I'll leave out the lineno's)
+ Additional docstring
+ <Attribute name="__docformat__">
+ <Expression>
+ 'reStructuredText'
+ <Attribute name="a">
+ <Expression>
+ 1
+ <Docstring>
+ Attribute docstring
+ <Class name="C" inheritance="Super">
+ <Docstring>
+ C's docstring
+ <Attribute name="class_attribute">
+ <Expression>
+ 1
+ <Docstring>
+ class_attribute's docstring
+ <Method name="__init__" argnames=['self', ('text', 'None')]>
+ <Docstring>
+ __init__'s docstring
+ <Attribute name="instance_attribute" instance=True>
+ <Expression>
+ (text * 7
+ + ' whaddyaknow')
+            <Docstring>
+                instance_attribute's docstring
+ <Function name="f">
+ <Parameters>
+ <Parameter name="x">
+ <Comment>
+ # parameter x
+ <Parameter name="y">
+ <Expression>
+ a*5
+ <Comment>
+ # parameter y
+ <Parameter name="args" varargs=True>
+ <Comment>
+ # parameter args
+ <Docstring>
+ f's docstring
+ <Attribute name="function_attribute">
+ <Expression>
+ 1
+ <Docstring>
+ f.function_attribute's docstring
+
+compiler.parse() provides most of what's needed for this AST. I think that
+"tokenize" can be used to get the rest, and all that's left is to hunker down
+and figure out how. We can determine the line number from the
+compiler.parse() AST, and a get_rhs(lineno) method would provide the rest.
+
+The Docutils Python reader component will transform this AST into a
+Python-specific doctree, and then a `stylist transform`_ would further
+transform it into a generic doctree. Namespaces will have to be compiled for
+each of the scopes, but I'm not certain at what stage of processing.
+
+It's very important to keep all docstring processing out of this, so that it's
+completely generic and not tool-specific.
+
+> Why perform all of those transformations? Why not go from the AST to a
+> generic doctree? Or, even from the AST to the final output?
+
+I want the docutils.readers.python.moduleparser.parse_module() function to
+produce a standard documentation-oriented AST that can be used by any tool.
+We can develop it together without having to compromise on the rest of our
+design (i.e., HappyDoc doesn't have to be made to work like Docutils, and
+vice-versa). It would be a higher-level version of what compiler.py provides.
+
+The Python reader component transforms this generic AST into a Python-specific
+doctree (it knows about modules, classes, functions, etc.), but this is
+specific to Docutils and cannot be used by HappyDoc or others. The stylist
+transform does the final layout, converting Python-specific structures
+("class" sections, etc.) into a generic doctree using primitives (tables,
+sections, lists, etc.). This generic doctree does *not* know about Python
+structures any more. The advantage is that this doctree can be handed off to
+any of the output writers to create any output format we like.
+
+The latter two transforms are separate because I want to be able to have
+multiple independent layout styles (multiple runtime-selectable "stylist
+transforms"). Each of the existing tools (HappyDoc, pydoc, epydoc, Crystal,
+etc.) has its own fixed format. I personally don't like the tables-based
+format produced by these tools, and I'd like to be able to customize the
+format easily. That's the goal of stylist transforms, which are independent
+from the Reader component itself. One stylist transform could produce
+HappyDoc-like output, another could produce output similar to module docs in
+the Python library reference manual, and so on.
+
+It's for exactly this reason:
+
+>> It's very important to keep all docstring processing out of this, so that
+>> it's completely generic and not tool-specific.
+
+... but it goes past docstring processing. It's also important to keep style
+decisions and tool-specific data transforms out of this module parser.
+
"""
__docformat__ = 'reStructuredText'
@@ -21,8 +165,10 @@ __docformat__ = 'reStructuredText'
import sys
import compiler
import compiler.ast
-import compiler.visitor
+import tokenize
+import token
from compiler.consts import OP_ASSIGN
+from compiler.visitor import ASTVisitor
from types import StringType, UnicodeType
@@ -33,10 +179,10 @@ def parse_module(module_text, filename):
return visitor.module
-class ModuleVisitor(compiler.visitor.ASTVisitor):
+class ModuleVisitor(ASTVisitor):
def __init__(self, filename):
- compiler.visitor.ASTVisitor.__init__(self)
+ ASTVisitor.__init__(self)
self.filename = filename
self.module = None
self.context = []
@@ -45,11 +191,11 @@ class ModuleVisitor(compiler.visitor.ASTVisitor):
def default(self, node, *args):
self.documentable = None
#print 'in default (%s)' % node.__class__.__name__
- #compiler.visitor.ASTVisitor.default(self, node, *args)
+ #ASTVisitor.default(self, node, *args)
def default_ignore(self, node, *args):
#print 'in default_ignore (%s)' % node.__class__.__name__
- compiler.visitor.ASTVisitor.default(self, node, *args)
+ ASTVisitor.default(self, node, *args)
def visitModule(self, node):
#print dir(node)
@@ -95,23 +241,66 @@ class ModuleVisitor(compiler.visitor.ASTVisitor):
self.documentable = None
-class AssignmentVisitor(compiler.visitor.ASTVisitor):
+class AssignmentVisitor(ASTVisitor):
+
+ """
+ Tried reconstructing expressions (the RHS of assignments) by
+ visiting the compiler.parse() tree, but a lot of information is
+ missing, like parenthesis-grouping of expressions.
+
+ Gotta do it by parsing tokens.
+ """
def __init__(self):
- compiler.visitor.ASTVisitor.__init__(self)
+ ASTVisitor.__init__(self)
self.attributes = []
+ self.parts = []
def default(self, node, *args):
- pass
+ print >>sys.stderr, '%s not visited!' % node.__class__.__name__
+ ASTVisitor.default(self, node)
def visitAssign(self, node):
- compiler.visitor.ASTVisitor.default(self, node)
+ ASTVisitor.default(self, node)
+ self.attributes[-1].append(Expression(node, ''.join(self.parts)))
def visitAssName(self, node):
self.attributes.append(Attribute(node, node.name))
- def get_rhs(self, node):
- return "'TBD'"
+ def visitAdd(self, node):
+ ASTVisitor.default(self, node)
+ self.parts[-2:] = ' + '.join(self.parts[-2:])
+
+ def visitAnd(self, node):
+ ASTVisitor.default(self, node)
+ self.parts.insert(len(self.parts) - 1, ' and ')
+
+ def visitBackquote(self, node):
+ self.parts.append('`')
+ ASTVisitor.default(self, node)
+ self.parts.append('`')
+
+ def visitBitand(self, node):
+ ASTVisitor.default(self, node)
+ self.parts.insert(len(self.parts) - 1, ' & ')
+
+ def visitBitor(self, node):
+ ASTVisitor.default(self, node)
+ self.parts.insert(len(self.parts) - 1, ' | ')
+
+ def visitBitxor(self, node):
+ ASTVisitor.default(self, node)
+ self.parts.insert(len(self.parts) - 1, ' ^ ')
+
+ def visitConst(self, node):
+ self.parts.append(repr(node.value))
+
+ def visitConst(self, node):
+ self.parts.append(repr(node.value))
+
+ def visitInvert(self, node):
+ self.parts.append('~ ')
+ ASTVisitor.default(self, node)
class Node:
@@ -211,7 +400,44 @@ class Expression(Node):
def __init__(self, node, text):
Node.__init__(self, node)
self.text = text
-
+
+ def __str__(self, indent=' ', level=0):
+ prefix = indent * (level + 1)
+ return '%s%s%s\n' % (Node.__str__(self, indent, level),
+ prefix, self.text)
+
+
+class TokenReader:
+
+ def __init__(self, text):
+ self.text = text
+ self.lines = text.splitlines(1)
+ self.generator = tokenize.generate_tokens(iter(self.lines).next)
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ token = self.generator.next()
+ self.type, self.string, self.start, self.end, self.line = token
+ return token
+
+ def goto_line(self, lineno):
+ for token in self:
+ if self.start[0] >= lineno:
+ return token
+ else:
+ raise IndexError
+
+ def rhs(self, name, lineno):
+ self.goto_line(lineno)
+ while self.start[0] == lineno:
+ if self.type == token.OP and self.string == '=':
+ break
+ self.next()
+ else:
+ raise IndexError
+
def trim_docstring(text):
"""