 pygments/lexers/markup.py    |  53
 tests/test_markdown_lexer.py | 525
 2 files changed, 557 insertions(+), 21 deletions(-)
diff --git a/pygments/lexers/markup.py b/pygments/lexers/markup.py
index 8c03d0cd..f185ce9e 100644
--- a/pygments/lexers/markup.py
+++ b/pygments/lexers/markup.py
@@ -519,9 +519,9 @@ class MarkdownLexer(RegexLexer):
from pygments.lexers import get_lexer_by_name
# section header
- yield match.start(1), String , match.group(1)
- yield match.start(2), String , match.group(2)
- yield match.start(3), Text , match.group(3)
+ yield match.start(1), String.Backtick, match.group(1)
+ yield match.start(2), String.Backtick, match.group(2)
+ yield match.start(3), Text , match.group(3)
# lookup lexer if wanted and existing
lexer = None
@@ -539,44 +539,55 @@ class MarkdownLexer(RegexLexer):
for item in do_insertions([], lexer.get_tokens_unprocessed(code)):
yield item
- yield match.start(5), String , match.group(5)
+ yield match.start(5), String.Backtick, match.group(5)
tokens = {
'root': [
- # heading with pound prefix
- (r'^(#)([^#].+\n)', bygroups(Generic.Heading, Text)),
- (r'^(#{2,6})(.+\n)', bygroups(Generic.Subheading, Text)),
+ # heading with '#' prefix (atx-style)
+ (r'(^#[^#].+)(\n)', bygroups(Generic.Heading, Text)),
+ # subheading with '#' prefix (atx-style)
+ (r'(^#{2,6}[^#].+)(\n)', bygroups(Generic.Subheading, Text)),
+ # heading with '=' underlines (Setext-style)
+ (r'^(.+)(\n)(=+)(\n)', bygroups(Generic.Heading, Text, Generic.Heading, Text)),
+ # subheading with '-' underlines (Setext-style)
+ (r'^(.+)(\n)(-+)(\n)', bygroups(Generic.Subheading, Text, Generic.Subheading, Text)),
# task list
(r'^(\s*)([*-] )(\[[ xX]\])( .+\n)',
bygroups(Text, Keyword, Keyword, using(this, state='inline'))),
- # bulleted lists
+ # bulleted list
(r'^(\s*)([*-])(\s)(.+\n)',
bygroups(Text, Keyword, Text, using(this, state='inline'))),
- # numbered lists
+ # numbered list
(r'^(\s*)([0-9]+\.)( .+\n)',
bygroups(Text, Keyword, using(this, state='inline'))),
# quote
(r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
- # text block
- (r'^(```\n)([\w\W]*?)(^```$)', bygroups(String, Text, String)),
+ # code block fenced by 3 backticks
+ (r'^(\s*```\n(.+\n)+\s*```$)', String.Backtick),
# code block with language
- (r'^(```)(\w+)(\n)([\w\W]*?)(^```$)', _handle_codeblock),
+ (r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$)', _handle_codeblock),
+ # code block indented with 4 spaces or 1 tab
+ (r'(\n\n)((\ {4}|\t)(.+\n)+)', bygroups(Text, String.Backtick)),
include('inline'),
],
'inline': [
# escape
(r'\\.', Text),
- # italics
- (r'(\s)([*_][^*_]+[*_])(\W|\n)', bygroups(Text, Generic.Emph, Text)),
- # bold
- # warning: the following rule eats internal tags. eg. **foo _bar_ baz** bar is not italics
- (r'(\s)((\*\*|__).*\3)((?=\W|\n))', bygroups(Text, Generic.Strong, None, Text)),
- # "proper way" (r'(\s)([*_]{2}[^*_]+[*_]{2})((?=\W|\n))', bygroups(Text, Generic.Strong, Text)),
- # strikethrough
- (r'(\s)(~~[^~]+~~)((?=\W|\n))', bygroups(Text, Generic.Deleted, Text)),
# inline code
- (r'`[^`]+`', String.Backtick),
+ (r'([^`])(`[^`\n]+`)', bygroups(Text, String.Backtick)),
+ # warning: the following rules eat outer tags.
+ # eg. **foo *bar* baz** => foo and baz are not recognized as bold
+ # bold fenced by '**'
+ (r'(\*\*[^\*\n\ ][^\*\n]*\*\*)', bygroups(Generic.Strong)),
+ # bold fenced by '__'
+ (r'(\_\_[^\_\n\ ][^\_\n]*\_\_)', bygroups(Generic.Strong)),
+ # italics fenced by '*'
+ (r'(\*[^\*\n\ ][^\*\n]*\*)', bygroups(Generic.Emph)),
+ # italics fenced by '_'
+ (r'(\_[^\_\n\ ][^\_\n]*\_)', bygroups(Generic.Emph)),
+ # strikethrough
+ (r'([^~]*)(~~[^~]+~~)', bygroups(Text, Generic.Deleted)),
# mentions and topics (twitter and github stuff)
(r'[@#][\w/:]+', Name.Entity),
# (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
diff --git a/tests/test_markdown_lexer.py b/tests/test_markdown_lexer.py
index 9024bf07..524becd7 100644
--- a/tests/test_markdown_lexer.py
+++ b/tests/test_markdown_lexer.py
@@ -8,6 +8,7 @@
"""
import pytest
+from pygments.token import Generic, Token, String, Keyword, Name
from pygments.lexers.markup import MarkdownLexer
@@ -34,3 +35,527 @@ def test_code_fence_gsm(lexer):
def test_code_fence_gsm_with_no_lexer(lexer):
assert_same_text(lexer, r'```invalid-lexer\nfoo\n```\n')
+
+
+def test_invalid_atx_heading(lexer):
+ fragments = (
+ '#',
+ 'a #',
+ '*#',
+ )
+
+ for fragment in fragments:
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != Generic.Heading
+
+
+def test_atx_heading(lexer):
+ fragments = (
+ '#Heading',
+ '# Heading',
+ '# Another heading',
+ '# Another # heading',
+ '# Heading #',
+ )
+
+ for fragment in fragments:
+ tokens = [
+ (Generic.Heading, fragment),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_atx_subheading(lexer):
+ fragments = (
+ '##',
+ 'a ##',
+ '*##',
+ '####### too many hashes'
+ )
+
+ for fragment in fragments:
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != Generic.Subheading
+
+
+def test_atx_subheading(lexer):
+ fragments = (
+ '##Subheading',
+ '## Subheading',
+ '### Subheading',
+ '#### Subheading',
+ '##### Subheading',
+ '###### Subheading',
+ '## Another subheading',
+ '## Another ## subheading',
+ '###### Subheading #',
+ '###### Subheading ######',
+ )
+
+ for fragment in fragments:
+ tokens = [
+ (Generic.Subheading, fragment),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_setext_heading(lexer):
+ fragments = (
+ 'Heading\n',
+ 'Heading\n_',
+ 'Heading\n =====',
+ 'Heading\na=====',
+ '=====',
+ '\n=\n',
+ 'Heading\n=====Text'
+ )
+
+ for fragment in fragments:
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != Generic.Heading
+
+
+def test_setext_heading(lexer):
+ fragments = (
+ 'Heading\n=',
+ 'Heading\n=======',
+ 'Heading\n==========',
+ )
+
+ for fragment in fragments:
+ tokens = [
+ (Generic.Heading, fragment.split('\n')[0]),
+ (Token.Text, '\n'),
+ (Generic.Heading, fragment.split('\n')[1]),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_setext_subheading(lexer):
+ fragments = (
+ 'Subheading\n',
+ 'Subheading\n_',
+ 'Subheading\n -----',
+ 'Subheading\na-----',
+ '-----',
+ '\n-\n',
+ 'Subheading\n-----Text'
+ )
+
+ for fragment in fragments:
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != Generic.Subheading
+
+
+def test_setext_subheading(lexer):
+ fragments = (
+ 'Subheading\n-',
+ 'Subheading\n----------',
+ 'Subheading\n-----------',
+ )
+
+ for fragment in fragments:
+ tokens = [
+ (Generic.Subheading, fragment.split('\n')[0]),
+ (Token.Text, '\n'),
+ (Generic.Subheading, fragment.split('\n')[1]),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_task_list(lexer):
+ fragment = '- [ ] sample task'
+ tokens = [
+ (Keyword, '- '),
+ (Keyword, '[ ]'),
+ (Token.Text, ' '),
+ (Token.Text, 'sample'),
+ (Token.Text, ' '),
+ (Token.Text, 'task'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = '* [ ] sample task'
+ tokens = [
+ (Keyword, '* '),
+ (Keyword, '[ ]'),
+ (Token.Text, ' '),
+ (Token.Text, 'sample'),
+ (Token.Text, ' '),
+ (Token.Text, 'task'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = ' * [ ] sample task'
+ tokens = [
+ (Token.Text, ' '),
+ (Keyword, '* '),
+ (Keyword, '[ ]'),
+ (Token.Text, ' '),
+ (Token.Text, 'sample'),
+ (Token.Text, ' '),
+ (Token.Text, 'task'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_bulleted_list(lexer):
+ fragment = '* foo\n* bar'
+ tokens = [
+ (Keyword, '*'),
+ (Token.Text, ' '),
+ (Token.Text, 'foo'),
+ (Token.Text, '\n'),
+ (Keyword, '*'),
+ (Token.Text, ' '),
+ (Token.Text, 'bar'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = '- foo\n- bar'
+ tokens = [
+ (Keyword, '-'),
+ (Token.Text, ' '),
+ (Token.Text, 'foo'),
+ (Token.Text, '\n'),
+ (Keyword, '-'),
+ (Token.Text, ' '),
+ (Token.Text, 'bar'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = '* *foo*\n* bar'
+ tokens = [
+ (Keyword, '*'),
+ (Token.Text, ' '),
+ (Generic.Emph, '*foo*'),
+ (Token.Text, '\n'),
+ (Keyword, '*'),
+ (Token.Text, ' '),
+ (Token.Text, 'bar'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_numbered_list(lexer):
+ fragment = '1. foo\n2. bar'
+ tokens = [
+ (Keyword, '1.'),
+ (Token.Text, ' '),
+ (Token.Text, 'foo'),
+ (Token.Text, '\n'),
+ (Keyword, '2.'),
+ (Token.Text, ' '),
+ (Token.Text, 'bar'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_quote(lexer):
+ fragment = '> a\n> quote'
+ tokens = [
+ (Keyword, '> '),
+ (Generic.Emph, 'a\n'),
+ (Keyword, '> '),
+ (Generic.Emph, 'quote\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_code_block(lexer):
+ fragments = (
+ '```code```',
+ 'prefix not allowed before ```\ncode block\n```',
+ '    code',
+ )
+
+ for fragment in fragments:
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != String.Backtick
+
+
+def test_code_block_fenced_by_backticks(lexer):
+ fragments = (
+ '```\ncode\n```',
+ '```\nmulti\n`line`\ncode\n```',
+ )
+ for fragment in fragments:
+ tokens = [
+ (String.Backtick, fragment),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_code_block_with_language(lexer):
+ fragments = (
+ '```python\nimport this\n```',
+ )
+ for fragment in fragments:
+ tokens = [
+ (String.Backtick, '```'),
+ (String.Backtick, 'python'),
+ (Token.Text, '\n'),
+ (Token.Keyword.Namespace, 'import'),
+ (Token.Text, ' '),
+ (Token.Name.Namespace, 'this'),
+ (Token.Text, '\n'),
+ (String.Backtick, '```'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_code_indented_with_spaces(lexer):
+ fragments = (
+ 'sample:\n\n    code\n',
+ )
+ for fragment in fragments:
+ tokens = [
+ (Token.Text, 'sample:'),
+ (Token.Text, '\n\n'),
+ (String.Backtick, '    code\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragments = (
+ 'sample:\n\n\tcode\n',
+ )
+ for fragment in fragments:
+ tokens = [
+ (Token.Text, 'sample:'),
+ (Token.Text, '\n\n'),
+ (String.Backtick, '\tcode\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_inline_code(lexer):
+ fragment = 'code: `code`'
+ tokens = [
+ (Token.Text, 'code:'),
+ (Token.Text, ' '),
+ (String.Backtick, '`code`'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = ' `**code**`'
+ tokens = [
+ (Token.Text, ' '),
+ (String.Backtick, '`**code**`'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = '(`code`)'
+ tokens = [
+ (Token.Text, '('),
+ (String.Backtick, '`code`'),
+ (Token.Text, ')'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_bold(lexer):
+ fragments = (
+ '**no bold__',
+ '__no bold**',
+ '*no bold*',
+ '_no bold_',
+ )
+
+ for fragment in fragments:
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != Generic.Strong
+
+
+def test_bold_fenced_by_asterisk(lexer):
+ fragment = '**bold**'
+ tokens = [
+ (Generic.Strong, '**bold**'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_bold_fenced_by_underscore(lexer):
+ fragment = '__bold__'
+ tokens = [
+ (Generic.Strong, '__bold__'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_italics(lexer):
+ fragments = (
+ '*no italics_',
+ '_no italics*',
+ '**no italics**',
+ '__no italics__',
+ )
+
+ for fragment in fragments:
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != Generic.Emph
+
+
+def test_italics_fenced_by_asterisk(lexer):
+ fragment = '*italics*'
+ tokens = [
+ (Generic.Emph, '*italics*'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_italics_fenced_by_underscore(lexer):
+ fragment = '_italics_'
+ tokens = [
+ (Generic.Emph, '_italics_'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_escape_italics(lexer):
+ fragments = (
+ r'\*no italics\*',
+ r'\_ no italics \_',
+ )
+
+ for fragment in fragments:
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != Generic.Emph
+
+
+def test_italics_no_multiline(lexer):
+ fragment = '*no\nitalics*'
+
+ for token, _ in lexer.get_tokens(fragment):
+ assert token != Generic.Emph
+
+
+def test_italics_and_bold(lexer):
+ fragment = '**bold** and *italics*'
+ tokens = [
+ (Generic.Strong, '**bold**'),
+ (Token.Text, ' '),
+ (Token.Text, 'and'),
+ (Token.Text, ' '),
+ (Generic.Emph, '*italics*'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = '*italics* and **bold**'
+ tokens = [
+ (Generic.Emph, '*italics*'),
+ (Token.Text, ' '),
+ (Token.Text, 'and'),
+ (Token.Text, ' '),
+ (Generic.Strong, '**bold**'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_strikethrough(lexer):
+ fragment = '~~striked~~not striked'
+ tokens = [
+ (Generic.Deleted, '~~striked~~'),
+ (Token.Text, 'not'),
+ (Token.Text, ' '),
+ (Token.Text, 'striked'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_mentions(lexer):
+ fragment = 'note for @me:'
+ tokens = [
+ (Token.Text, 'note'),
+ (Token.Text, ' '),
+ (Token.Text, 'for'),
+ (Token.Text, ' '),
+ (Name.Entity, '@me:'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_topics(lexer):
+ fragment = 'message to #you:'
+ tokens = [
+ (Token.Text, 'message'),
+ (Token.Text, ' '),
+ (Token.Text, 'to'),
+ (Token.Text, ' '),
+ (Name.Entity, '#you:'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_links(lexer):
+ fragment = '[text](link)'
+ tokens = [
+ (Token.Text, '['),
+ (Token.Name.Tag, 'text'),
+ (Token.Text, ']'),
+ (Token.Text, '('),
+ (Token.Name.Attribute, 'link'),
+ (Token.Text, ')'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = '![Image of foo](https://bar.baz)'
+ tokens = [
+ (Token.Text, '!['),
+ (Token.Name.Tag, 'Image of foo'),
+ (Token.Text, ']'),
+ (Token.Text, '('),
+ (Token.Name.Attribute, 'https://bar.baz'),
+ (Token.Text, ')'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_reference_style_links(lexer):
+ fragment = '[an example][id]'
+ tokens = [
+ (Token.Text, '['),
+ (Token.Name.Tag, 'an example'),
+ (Token.Text, ']'),
+ (Token.Text, '['),
+ (Token.Name.Label, 'id'),
+ (Token.Text, ']'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
+
+ fragment = '[id]: http://example.com'
+ tokens = [
+ (Token.Text, '['),
+ (Token.Name.Label, 'id'),
+ (Token.Text, ']: '),
+ (Token.Name.Attribute, 'http://example.com'),
+ (Token.Text, '\n'),
+ ]
+ assert list(lexer.get_tokens(fragment)) == tokens
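
A minimal sketch (not part of the patch) of how the updated MarkdownLexer rules can be exercised through pygments' public API; the sample text is illustrative only:

    from pygments.lexers.markup import MarkdownLexer

    sample = (
        'Heading\n'
        '=======\n'
        'Some **bold**, *italic* and `inline code` text.\n'
        '\n'
        '```python\n'
        'import this\n'
        '```\n'
    )

    # Dump the token stream produced by the new heading, emphasis and
    # fenced-code-block rules.
    for token_type, value in MarkdownLexer().get_tokens(sample):
        print(token_type, repr(value))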