summary refs log tree commit diff
path: root/pygments
diff options
context:
space:
mode:
author Leistungsabfall <Leistungsabfall@users.noreply.github.com> 2020-07-21 21:06:39 +0200
committer GitHub <noreply@github.com> 2020-07-21 21:06:39 +0200
commit a486735f2b485ed35bcb60101fc9a3e2a0eca11d (patch)
tree 81c3f0e065e4aa40dac31631197277270a0a1a54 /pygments
parent 728fd19ebcb69ddb2c7af159939e1a19ae53a892 (diff)
download pygments-git-a486735f2b485ed35bcb60101fc9a3e2a0eca11d.tar.gz
Improve Markdown lexer (#1495)
* Add support for Setext-style headings in Markdown
* Improve inline code detection in Markdown
* Add support for indented code blocks in Markdown
* Improve italics & bold detection in Markdown
* Simplify italics & bold regexes in Markdown
* Add warning about possible unrecognized internal tags in Markdown
* Improve strikethrough detection in Markdown
* Small bugfix in Markdown
* Small bugfix in Markdown
* Small refactoring in Markdown
Diffstat (limited to 'pygments')
-rw-r--r-- pygments/lexers/markup.py 53
1 files changed, 32 insertions, 21 deletions
diff --git a/pygments/lexers/markup.py b/pygments/lexers/markup.py
index 8c03d0cd..f185ce9e 100644
--- a/pygments/lexers/markup.py
+++ b/pygments/lexers/markup.py
@@ -519,9 +519,9 @@ class MarkdownLexer(RegexLexer):
from pygments.lexers import get_lexer_by_name
# section header
- yield match.start(1), String , match.group(1)
- yield match.start(2), String , match.group(2)
- yield match.start(3), Text , match.group(3)
+ yield match.start(1), String.Backtick, match.group(1)
+ yield match.start(2), String.Backtick, match.group(2)
+ yield match.start(3), Text , match.group(3)
# lookup lexer if wanted and existing
lexer = None
@@ -539,44 +539,55 @@ class MarkdownLexer(RegexLexer):
for item in do_insertions([], lexer.get_tokens_unprocessed(code)):
yield item
- yield match.start(5), String , match.group(5)
+ yield match.start(5), String.Backtick, match.group(5)
tokens = {
'root': [
- # heading with pound prefix
- (r'^(#)([^#].+\n)', bygroups(Generic.Heading, Text)),
- (r'^(#{2,6})(.+\n)', bygroups(Generic.Subheading, Text)),
+ # heading with '#' prefix (atx-style)
+ (r'(^#[^#].+)(\n)', bygroups(Generic.Heading, Text)),
+ # subheading with '#' prefix (atx-style)
+ (r'(^#{2,6}[^#].+)(\n)', bygroups(Generic.Subheading, Text)),
+ # heading with '=' underlines (Setext-style)
+ (r'^(.+)(\n)(=+)(\n)', bygroups(Generic.Heading, Text, Generic.Heading, Text)),
+ # subheading with '-' underlines (Setext-style)
+ (r'^(.+)(\n)(-+)(\n)', bygroups(Generic.Subheading, Text, Generic.Subheading, Text)),
# task list
(r'^(\s*)([*-] )(\[[ xX]\])( .+\n)',
bygroups(Text, Keyword, Keyword, using(this, state='inline'))),
- # bulleted lists
+ # bulleted list
(r'^(\s*)([*-])(\s)(.+\n)',
bygroups(Text, Keyword, Text, using(this, state='inline'))),
- # numbered lists
+ # numbered list
(r'^(\s*)([0-9]+\.)( .+\n)',
bygroups(Text, Keyword, using(this, state='inline'))),
# quote
(r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
- # text block
- (r'^(```\n)([\w\W]*?)(^```$)', bygroups(String, Text, String)),
+ # code block fenced by 3 backticks
+ (r'^(\s*```\n(.+\n)+\s*```$)', String.Backtick),
# code block with language
- (r'^(```)(\w+)(\n)([\w\W]*?)(^```$)', _handle_codeblock),
+ (r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$)', _handle_codeblock),
+ # code block indented with 4 spaces or 1 tab
+ (r'(\n\n)((\ {4}|\t)(.+\n)+)', bygroups(Text, String.Backtick)),
include('inline'),
],
'inline': [
# escape
(r'\\.', Text),
- # italics
- (r'(\s)([*_][^*_]+[*_])(\W|\n)', bygroups(Text, Generic.Emph, Text)),
- # bold
- # warning: the following rule eats internal tags. eg. **foo _bar_ baz** bar is not italics
- (r'(\s)((\*\*|__).*\3)((?=\W|\n))', bygroups(Text, Generic.Strong, None, Text)),
- # "proper way" (r'(\s)([*_]{2}[^*_]+[*_]{2})((?=\W|\n))', bygroups(Text, Generic.Strong, Text)),
- # strikethrough
- (r'(\s)(~~[^~]+~~)((?=\W|\n))', bygroups(Text, Generic.Deleted, Text)),
# inline code
- (r'`[^`]+`', String.Backtick),
+ (r'([^`])(`[^`\n]+`)', bygroups(Text, String.Backtick)),
+ # warning: the following rules eat outer tags.
+ # eg. **foo _bar_ baz** => foo and baz are not recognized as bold
+ # bold fenced by '**'
+ (r'(\*\*[^\*\n\ ][^\*\n]*\*\*)', bygroups(Generic.Strong)),
+ # # bold fenced by '__'
+ (r'(\_\_[^\_\n\ ][^\_\n]*\_\_)', bygroups(Generic.Strong)),
+ # italics fenced by '*'
+ (r'(\*[^\*\n\ ][^\*\n]*\*)', bygroups(Generic.Emph)),
+ # italics fenced by '_'
+ (r'(\_[^\_\n\ ][^\_\n]*\_)', bygroups(Generic.Emph)),
+ # strikethrough
+ (r'([^~]*)(~~[^~]+~~)', bygroups(Text, Generic.Deleted)),
# mentions and topics (twitter and github stuff)
(r'[@#][\w/:]+', Name.Entity),
# (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)