summaryrefslogtreecommitdiff
path: root/markdown/extensions
diff options
context:
space:
mode:
authorWaylan Limberg <waylan.limberg@icloud.com>2020-09-22 10:42:17 -0400
committerGitHub <noreply@github.com>2020-09-22 10:42:17 -0400
commitb701c34ebd7b2d0eb319517b9a275ddf0c89608d (patch)
treeb79839201a337d38276f345595947b0a15a7567b /markdown/extensions
parent90e750b1f4fa8d150d7b5a4709858c786f2794dd (diff)
downloadpython-markdown-b701c34ebd7b2d0eb319517b9a275ddf0c89608d.tar.gz
Refactor HTML Parser (#803)
The HTML parser has been completely replaced. The new HTML parser is built on Python's html.parser.HTMLParser, which alleviates various bugs and simplifies maintenance of the code. The md_in_html extension has been rebuilt on the new HTML Parser, which drastically simplifies it. Note that raw HTML elements with a markdown attribute defined are now converted to ElementTree Elements and are rendered by the serializer. Various bugs have been fixed. Link reference parsing, abbreviation reference parsing and footnote reference parsing has all been moved from preprocessors to blockprocessors, which allows them to be nested within other block level elements. Specifically, this change was necessary to maintain the current behavior in the rebuilt md_in_html extension. A few random edge-case bugs (see the included tests) were resolved in the process. Closes #595, closes #780, closes #830 and closes #1012.
Diffstat (limited to 'markdown/extensions')
-rw-r--r--markdown/extensions/abbr.py49
-rw-r--r--markdown/extensions/footnotes.py164
-rw-r--r--markdown/extensions/md_in_html.py303
3 files changed, 343 insertions, 173 deletions
diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py
index b53f2c4..9879314 100644
--- a/markdown/extensions/abbr.py
+++ b/markdown/extensions/abbr.py
@@ -17,48 +17,53 @@ License: [BSD](https://opensource.org/licenses/bsd-license.php)
'''
from . import Extension
-from ..preprocessors import Preprocessor
+from ..blockprocessors import BlockProcessor
from ..inlinepatterns import InlineProcessor
from ..util import AtomicString
import re
import xml.etree.ElementTree as etree
-# Global Vars
-ABBR_REF_RE = re.compile(r'[*]\[(?P<abbr>[^\]]*)\][ ]?:\s*(?P<title>.*)')
-
class AbbrExtension(Extension):
""" Abbreviation Extension for Python-Markdown. """
def extendMarkdown(self, md):
""" Insert AbbrPreprocessor before ReferencePreprocessor. """
- md.preprocessors.register(AbbrPreprocessor(md), 'abbr', 12)
+ md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)
-class AbbrPreprocessor(Preprocessor):
+class AbbrPreprocessor(BlockProcessor):
""" Abbreviation Preprocessor - parse text for abbr references. """
- def run(self, lines):
+ RE = re.compile(r'^[*]\[(?P<abbr>[^\]]*)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)
+
+ def test(self, parent, block):
+ return True
+
+ def run(self, parent, blocks):
'''
Find and remove all Abbreviation references from the text.
Each reference is set as a new AbbrPattern in the markdown instance.
'''
- new_text = []
- for line in lines:
- m = ABBR_REF_RE.match(line)
- if m:
- abbr = m.group('abbr').strip()
- title = m.group('title').strip()
- self.md.inlinePatterns.register(
- AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
- )
- # Preserve the line to prevent raw HTML indexing issue.
- # https://github.com/Python-Markdown/markdown/issues/584
- new_text.append('')
- else:
- new_text.append(line)
- return new_text
+ block = blocks.pop(0)
+ m = self.RE.search(block)
+ if m:
+ abbr = m.group('abbr').strip()
+ title = m.group('title').strip()
+ self.parser.md.inlinePatterns.register(
+ AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
+ )
+ if block[m.end():].strip():
+ # Add any content after match back to blocks as separate block
+ blocks.insert(0, block[m.end():].lstrip('\n'))
+ if block[:m.start()].strip():
+ # Add any content before match back to blocks as separate block
+ blocks.insert(0, block[:m.start()].rstrip('\n'))
+ return True
+ # No match. Restore block.
+ blocks.insert(0, block)
+ return False
def _generate_pattern(self, text):
'''
diff --git a/markdown/extensions/footnotes.py b/markdown/extensions/footnotes.py
index beab919..f6f4c85 100644
--- a/markdown/extensions/footnotes.py
+++ b/markdown/extensions/footnotes.py
@@ -14,7 +14,7 @@ License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
from . import Extension
-from ..preprocessors import Preprocessor
+from ..blockprocessors import BlockProcessor
from ..inlinepatterns import InlineProcessor
from ..treeprocessors import Treeprocessor
from ..postprocessors import Postprocessor
@@ -26,8 +26,6 @@ import xml.etree.ElementTree as etree
FN_BACKLINK_TEXT = util.STX + "zz1337820767766393qq" + util.ETX
NBSP_PLACEHOLDER = util.STX + "qq3936677670287331zz" + util.ETX
-DEF_RE = re.compile(r'[ ]{0,3}\[\^([^\]]*)\]:\s*(.*)')
-TABBED_RE = re.compile(r'((\t)|( ))(.*)')
RE_REF_ID = re.compile(r'(fnref)(\d+)')
@@ -72,8 +70,8 @@ class FootnoteExtension(Extension):
md.registerExtension(self)
self.parser = md.parser
self.md = md
- # Insert a preprocessor before ReferencePreprocessor
- md.preprocessors.register(FootnotePreprocessor(self), 'footnote', 15)
+ # Insert a blockprocessor before ReferencePreprocessor
+ md.parser.blockprocessors.register(FootnoteBlockProcessor(self), 'footnote', 17)
# Insert an inline pattern before ImageReferencePattern
FOOTNOTE_RE = r'\[\^([^\]]*)\]' # blah blah [^1] blah
@@ -202,106 +200,92 @@ class FootnoteExtension(Extension):
return div
-class FootnotePreprocessor(Preprocessor):
+class FootnoteBlockProcessor(BlockProcessor):
""" Find all footnote references and store for later use. """
+ RE = re.compile(r'^[ ]{0,3}\[\^([^\]]*)\]:[ ]*(.*)$', re.MULTILINE)
+
def __init__(self, footnotes):
+ super().__init__(footnotes.parser)
self.footnotes = footnotes
- def run(self, lines):
- """
- Loop through lines and find, set, and remove footnote definitions.
-
- Keywords:
-
- * lines: A list of lines of text
-
- Return: A list of lines of text with footnote definitions removed.
-
- """
- newlines = []
- i = 0
- while True:
- m = DEF_RE.match(lines[i])
- if m:
- fn, _i = self.detectTabbed(lines[i+1:])
- fn.insert(0, m.group(2))
- i += _i-1 # skip past footnote
- footnote = "\n".join(fn)
- self.footnotes.setFootnote(m.group(1), footnote.rstrip())
- # Preserve a line for each block to prevent raw HTML indexing issue.
- # https://github.com/Python-Markdown/markdown/issues/584
- num_blocks = (len(footnote.split('\n\n')) * 2)
- newlines.extend([''] * (num_blocks))
+ def test(self, parent, block):
+ return True
+
+ def run(self, parent, blocks):
+ """ Find, set, and remove footnote definitions. """
+ block = blocks.pop(0)
+ m = self.RE.search(block)
+ if m:
+ id = m.group(1)
+ fn_blocks = [m.group(2)]
+
+ # Handle rest of block
+ therest = block[m.end():].lstrip('\n')
+ m2 = self.RE.search(therest)
+ if m2:
+ # Another footnote exists in the rest of this block.
+ # Any content before match is continuation of this footnote, which may be lazily indented.
+ before = therest[:m2.start()].rstrip('\n')
+ fn_blocks[0] = '\n'.join([fn_blocks[0], self.detab(before)]).lstrip('\n')
+ # Add back to blocks everything from begining of match forward for next iteration.
+ blocks.insert(0, therest[m2.start():])
else:
- newlines.append(lines[i])
- if len(lines) > i+1:
- i += 1
- else:
- break
- return newlines
+ # All remaining lines of block are continuation of this footnote, which may be lazily indented.
+ fn_blocks[0] = '\n'.join([fn_blocks[0], self.detab(therest)]).strip('\n')
- def detectTabbed(self, lines):
- """ Find indented text and remove indent before further proccesing.
+ # Check for child elements in remaining blocks.
+ fn_blocks.extend(self.detectTabbed(blocks))
- Keyword arguments:
+ footnote = "\n\n".join(fn_blocks)
+ self.footnotes.setFootnote(id, footnote.rstrip())
- * lines: an array of strings
+ if block[:m.start()].strip():
+ # Add any content before match back to blocks as separate block
+ blocks.insert(0, block[:m.start()].rstrip('\n'))
+ return True
+ # No match. Restore block.
+ blocks.insert(0, block)
+ return False
- Returns: a list of post processed items and the index of last line.
+ def detectTabbed(self, blocks):
+ """ Find indented text and remove indent before further proccesing.
+ Returns: a list of blocks with indentation removed.
"""
- items = []
- blank_line = False # have we encountered a blank line yet?
- i = 0 # to keep track of where we are
-
- def detab(line):
- match = TABBED_RE.match(line)
- if match:
- return match.group(4)
-
- for line in lines:
- if line.strip(): # Non-blank line
- detabbed_line = detab(line)
- if detabbed_line:
- items.append(detabbed_line)
- i += 1
- continue
- elif not blank_line and not DEF_RE.match(line):
- # not tabbed but still part of first par.
- items.append(line)
- i += 1
- continue
- else:
- return items, i+1
-
- else: # Blank line: _maybe_ we are done.
- blank_line = True
- i += 1 # advance
-
- # Find the next non-blank line
- for j in range(i, len(lines)):
- if lines[j].strip():
- next_line = lines[j]
- break
- else:
- # Include extreaneous padding to prevent raw HTML
- # parsing issue: https://github.com/Python-Markdown/markdown/issues/584
- items.append("")
- i += 1
+ fn_blocks = []
+ while blocks:
+ if blocks[0].startswith(' '*4):
+ block = blocks.pop(0)
+ # Check for new footnotes within this block and split at new footnote.
+ m = self.RE.search(block)
+ if m:
+ # Another footnote exists in this block.
+ # Any content before match is continuation of this footnote, which may be lazily indented.
+ before = block[:m.start()].rstrip('\n')
+ fn_blocks.append(self.detab(before))
+ # Add back to blocks everything from begining of match forward for next iteration.
+ blocks.insert(0, block[m.start():])
+ # End of this footnote.
+ break
else:
- break # There is no more text; we are done.
+ # Entire block is part of this footnote.
+ fn_blocks.append(self.detab(block))
+ else:
+ # End of this footnote.
+ break
+ return fn_blocks
- # Check if the next non-blank line is tabbed
- if detab(next_line): # Yes, more work to do.
- items.append("")
- continue
- else:
- break # No, we are done.
- else:
- i += 1
+ def detab(self, block):
+ """ Remove one level of indent from a block.
- return items, i
+ Preserve lazily indented blocks by only removing indent from indented lines.
+ """
+ lines = block.split('\n')
+ for i, line in enumerate(lines):
+ if line.startswith(' '*4):
+ lines[i] = line[4:]
+ return '\n'.join(lines)
class FootnoteInlineProcessor(InlineProcessor):
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index 500c166..3518d05 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -16,68 +16,251 @@ License: [BSD](https://opensource.org/licenses/bsd-license.php)
from . import Extension
from ..blockprocessors import BlockProcessor
+from ..preprocessors import Preprocessor
from .. import util
-import re
+from ..htmlparser import HTMLExtractor
import xml.etree.ElementTree as etree
+# Block-level tags in which the content only gets span level parsing
+span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+
+# Block-level tags in which the content gets parsed as blocks
+block_tags = [
+ 'address', 'article', 'aside', 'blockquote', 'body', 'colgroup', 'details', 'div', 'dl', 'fieldset',
+ 'figcaption', 'figure', 'footer', 'form', 'iframe', 'header', 'hr', 'main', 'menu', 'nav', 'map',
+ 'noscript', 'object', 'ol', 'section', 'table', 'tbody', 'thead', 'tfoot', 'tr', 'ul'
+]
+
+# Block-level tags which never get their content parsed.
+raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
+
+block_level_tags = span_tags + block_tags + raw_tags
+
+
+class HTMLExtractorExtra(HTMLExtractor):
+ """
+ Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown.
+ """
+
+ def reset(self):
+ """Reset this instance. Loses all unprocessed data."""
+ self.mdstack = [] # When markdown=1, stack contains a list of tags
+ self.treebuilder = etree.TreeBuilder()
+ self.mdstate = [] # one of 'block', 'span', 'off', or None
+ super().reset()
+
+ def close(self):
+ """Handle any buffered data."""
+ super().close()
+ # Handle any unclosed tags.
+ if self.mdstack:
+ # Close the outermost parent. handle_endtag will close all unclosed children.
+ self.handle_endtag(self.mdstack[0])
+
+ def get_element(self):
+ """ Return element from treebuilder and reset treebuilder for later use. """
+ element = self.treebuilder.close()
+ self.treebuilder = etree.TreeBuilder()
+ return element
+
+ def get_state(self, tag, attrs):
+ """ Return state from tag and `markdown` attr. One of 'block', 'span', or 'off'. """
+ md_attr = attrs.get('markdown', '0')
+ if md_attr == 'markdown':
+ # `<tag markdown>` is the same as `<tag markdown='1'>`.
+ md_attr = '1'
+ parent_state = self.mdstate[-1] if self.mdstate else None
+ if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'):
+ # Only use the parent state if it is more restrictive than the markdown attribute.
+ md_attr = parent_state
+ if ((md_attr == '1' and tag in block_tags) or
+ (md_attr == 'block' and tag in span_tags + block_tags)):
+ return 'block'
+ elif ((md_attr == '1' and tag in span_tags) or
+ (md_attr == 'span' and tag in span_tags + block_tags)):
+ return 'span'
+ elif tag in block_level_tags:
+ return 'off'
+ else: # pragma: no cover
+ return None
+
+ def handle_starttag(self, tag, attrs):
+ if tag in block_level_tags:
+ # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
+ # Convert to `{'checked': 'checked'}`.
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ state = self.get_state(tag, attrs)
+
+ if self.inraw or (state in [None, 'off'] and not self.mdstack):
+ # fall back to default behavior
+ attrs.pop('markdown', None)
+ super().handle_starttag(tag, attrs)
+ else:
+ if 'p' in self.mdstack and tag in block_level_tags:
+ # Close unclosed 'p' tag
+ self.handle_endtag('p')
+ self.mdstate.append(state)
+ self.mdstack.append(tag)
+ attrs['markdown'] = state
+ self.treebuilder.start(tag, attrs)
+ else:
+ # Span level tag
+ if self.inraw:
+ super().handle_starttag(tag, attrs)
+ else:
+ text = self.get_starttag_text()
+ self.handle_data(text)
+
+ def handle_endtag(self, tag):
+ if tag in block_level_tags:
+ if self.inraw:
+ super().handle_endtag(tag)
+ elif tag in self.mdstack:
+ # Close element and any unclosed children
+ while self.mdstack:
+ item = self.mdstack.pop()
+ self.mdstate.pop()
+ self.treebuilder.end(item)
+ if item == tag:
+ break
+ if not self.mdstack:
+ # Last item in stack is closed. Stash it
+ element = self.get_element()
+ self.cleandoc.append(self.md.htmlStash.store(element))
+ self.cleandoc.append('\n\n')
+ self.state = []
+ else:
+ # Treat orphan closing tag as a span level tag.
+ text = self.get_endtag_text(tag)
+ self.handle_data(text)
+ else:
+ # Span level tag
+ if self.inraw:
+ super().handle_endtag(tag)
+ else:
+ text = self.get_endtag_text(tag)
+ self.handle_data(text)
+
+ def handle_data(self, data):
+ if self.inraw or not self.mdstack:
+ super().handle_data(data)
+ else:
+ self.treebuilder.data(data)
+
+ def handle_empty_tag(self, data, is_block):
+ if self.inraw or not self.mdstack:
+ super().handle_empty_tag(data, is_block)
+ else:
+ if self.at_line_start() and is_block:
+ self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n')
+ else:
+ self.handle_data(data)
+
+
+class HtmlBlockPreprocessor(Preprocessor):
+ """Remove html blocks from the text and store them for later retrieval."""
+
+ def run(self, lines):
+ source = '\n'.join(lines)
+ parser = HTMLExtractorExtra(self.md)
+ parser.feed(source)
+ parser.close()
+ return ''.join(parser.cleandoc).split('\n')
+
+
class MarkdownInHtmlProcessor(BlockProcessor):
- """Process Markdown Inside HTML Blocks."""
+ """Process Markdown Inside HTML Blocks which have been stored in the HtmlStash."""
+
def test(self, parent, block):
- return block == util.TAG_PLACEHOLDER % \
- str(self.parser.blockprocessors.tag_counter + 1)
-
- def _process_nests(self, element, block):
- """Process the element's child elements in self.run."""
- # Build list of indexes of each nest within the parent element.
- nest_index = [] # a list of tuples: (left index, right index)
- i = self.parser.blockprocessors.tag_counter + 1
- while len(self._tag_data) > i and self._tag_data[i]['left_index']:
- left_child_index = self._tag_data[i]['left_index']
- right_child_index = self._tag_data[i]['right_index']
- nest_index.append((left_child_index - 1, right_child_index))
- i += 1
-
- # Create each nest subelement.
- for i, (left_index, right_index) in enumerate(nest_index[:-1]):
- self.run(element, block[left_index:right_index],
- block[right_index:nest_index[i + 1][0]], True)
- self.run(element, block[nest_index[-1][0]:nest_index[-1][1]], # last
- block[nest_index[-1][1]:], True) # nest
-
- def run(self, parent, blocks, tail=None, nest=False):
- self._tag_data = self.parser.md.htmlStash.tag_data
-
- self.parser.blockprocessors.tag_counter += 1
- tag = self._tag_data[self.parser.blockprocessors.tag_counter]
-
- # Create Element
- markdown_value = tag['attrs'].pop('markdown')
- element = etree.SubElement(parent, tag['tag'], tag['attrs'])
-
- # Slice Off Block
- if nest:
- self.parser.parseBlocks(parent, tail) # Process Tail
- block = blocks[1:]
- else: # includes nests since a third level of nesting isn't supported
- block = blocks[tag['left_index'] + 1: tag['right_index']]
- del blocks[:tag['right_index']]
-
- # Process Text
- if (self.parser.blockprocessors.contain_span_tags.match( # Span Mode
- tag['tag']) and markdown_value != 'block') or \
- markdown_value == 'span':
- element.text = '\n'.join(block)
- else: # Block Mode
- i = self.parser.blockprocessors.tag_counter + 1
- if len(self._tag_data) > i and self._tag_data[i]['left_index']:
- first_subelement_index = self._tag_data[i]['left_index'] - 1
- self.parser.parseBlocks(
- element, block[:first_subelement_index])
- if not nest:
- block = self._process_nests(element, block)
- else:
- self.parser.parseBlocks(element, block)
+ # ALways return True. `run` will return `False` it not a valid match.
+ return True
+
+ def parse_element_content(self, element):
+ """
+ Resursively parse the text content of an etree Element as Markdown.
+
+ Any block level elements generated from the Markdown will be inserted as children of the element in place
+ of the text content. All `markdown` attributes are removed. For any elements in which Markdown parsing has
+ been dissabled, the text content of it and its chidlren are wrapped in an `AtomicString`.
+ """
+
+ md_attr = element.attrib.pop('markdown', 'off')
+
+ if md_attr == 'block':
+ # Parse content as block level
+ # The order in which the different parts are parsed (text, children, tails) is important here as the
+ # order of elements needs to be preserved. We can't be inserting items at a later point in the current
+ # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
+ # example). Therefore, the order of operations is children, tails, text.
+
+ # Recursively parse existing children from raw HTML
+ for child in list(element):
+ self.parse_element_content(child)
+
+ # Parse Markdown text in tail of children. Do this seperate to avoid raw HTML parsing.
+ # Save the position of each item to be inserted later in reverse.
+ tails = []
+ for pos, child in enumerate(element):
+ if child.tail:
+ block = child.tail.rstrip('\n')
+ child.tail = ''
+ # Use a dummy placeholder element.
+ dummy = etree.Element('div')
+ self.parser.parseBlocks(dummy, block.split('\n\n'))
+ children = list(dummy)
+ children.reverse()
+ tails.append((pos + 1, children))
+
+ # Insert the elements created from the tails in reverse.
+ tails.reverse()
+ for pos, tail in tails:
+ for item in tail:
+ element.insert(pos, item)
+
+ # Parse Markdown text content. Do this last to avoid raw HTML parsing.
+ if element.text:
+ block = element.text.rstrip('\n')
+ element.text = ''
+ # Use a dummy placeholder element as the content needs to get inserted before existing children.
+ dummy = etree.Element('div')
+ self.parser.parseBlocks(dummy, block.split('\n\n'))
+ children = list(dummy)
+ children.reverse()
+ for child in children:
+ element.insert(0, child)
+
+ elif md_attr == 'span':
+ # Span level parsing will be handled by inlineprocessors.
+ # Walk children here to remove any `markdown` attributes.
+ for child in list(element):
+ self.parse_element_content(child)
+
+ else:
+ # Disable inline parsing for everything else
+ element.text = util.AtomicString(element.text)
+ for child in list(element):
+ self.parse_element_content(child)
+ if child.tail:
+ child.tail = util.AtomicString(child.tail)
+
+ def run(self, parent, blocks):
+ m = util.HTML_PLACEHOLDER_RE.match(blocks[0])
+ if m:
+ index = int(m.group(1))
+ element = self.parser.md.htmlStash.rawHtmlBlocks[index]
+ if isinstance(element, etree.Element):
+ # We have a matched element. Process it.
+ blocks.pop(0)
+ self.parse_element_content(element)
+ parent.append(element)
+ # Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
+ self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
+ self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
+ # Comfirm the match to the blockparser.
+ return True
+ # No match found.
+ return False
class MarkdownInHtmlExtension(Extension):
@@ -86,14 +269,12 @@ class MarkdownInHtmlExtension(Extension):
def extendMarkdown(self, md):
""" Register extension instances. """
- # Turn on processing of markdown text within raw html
- md.preprocessors['html_block'].markdown_in_raw = True
+ # Replace raw HTML preprocessor
+ md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
+ # Add blockprocessor which handles the placeholders for etree elements
md.parser.blockprocessors.register(
MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
)
- md.parser.blockprocessors.tag_counter = -1
- md.parser.blockprocessors.contain_span_tags = re.compile(
- r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE)
def makeExtension(**kwargs): # pragma: no cover