-rw-r--r--  pygments/lexers/markup.py    |  53
-rw-r--r--  tests/test_markdown_lexer.py | 525
2 files changed, 557 insertions, 21 deletions
diff --git a/pygments/lexers/markup.py b/pygments/lexers/markup.py
index 8c03d0cd..f185ce9e 100644
--- a/pygments/lexers/markup.py
+++ b/pygments/lexers/markup.py
@@ -519,9 +519,9 @@ class MarkdownLexer(RegexLexer):
         from pygments.lexers import get_lexer_by_name

         # section header
-        yield match.start(1), String        , match.group(1)
-        yield match.start(2), String        , match.group(2)
-        yield match.start(3), Text          , match.group(3)
+        yield match.start(1), String.Backtick, match.group(1)
+        yield match.start(2), String.Backtick, match.group(2)
+        yield match.start(3), Text          , match.group(3)

         # lookup lexer if wanted and existing
         lexer = None
@@ -539,44 +539,55 @@ class MarkdownLexer(RegexLexer):
         for item in do_insertions([], lexer.get_tokens_unprocessed(code)):
             yield item

-        yield match.start(5), String        , match.group(5)
+        yield match.start(5), String.Backtick, match.group(5)

     tokens = {
         'root': [
-            # heading with pound prefix
-            (r'^(#)([^#].+\n)', bygroups(Generic.Heading, Text)),
-            (r'^(#{2,6})(.+\n)', bygroups(Generic.Subheading, Text)),
+            # heading with '#' prefix (atx-style)
+            (r'(^#[^#].+)(\n)', bygroups(Generic.Heading, Text)),
+            # subheading with '#' prefix (atx-style)
+            (r'(^#{2,6}[^#].+)(\n)', bygroups(Generic.Subheading, Text)),
+            # heading with '=' underlines (Setext-style)
+            (r'^(.+)(\n)(=+)(\n)', bygroups(Generic.Heading, Text, Generic.Heading, Text)),
+            # subheading with '-' underlines (Setext-style)
+            (r'^(.+)(\n)(-+)(\n)', bygroups(Generic.Subheading, Text, Generic.Subheading, Text)),
             # task list
             (r'^(\s*)([*-] )(\[[ xX]\])( .+\n)',
             bygroups(Text, Keyword, Keyword, using(this, state='inline'))),
-            # bulleted lists
+            # bulleted list
             (r'^(\s*)([*-])(\s)(.+\n)',
             bygroups(Text, Keyword, Text, using(this, state='inline'))),
-            # numbered lists
+            # numbered list
             (r'^(\s*)([0-9]+\.)( .+\n)',
             bygroups(Text, Keyword, using(this, state='inline'))),
             # quote
             (r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
-            # text block
-            (r'^(```\n)([\w\W]*?)(^```$)', bygroups(String, Text, String)),
+            # code block fenced by 3 backticks
+            (r'^(\s*```\n(.+\n)+\s*```$)', String.Backtick),
             # code block with language
-            (r'^(```)(\w+)(\n)([\w\W]*?)(^```$)', _handle_codeblock),
+            (r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$)', _handle_codeblock),
+            # code block indented with 4 spaces or 1 tab
+            (r'(\n\n)((\ {4}|\t)(.+\n)+)', bygroups(Text, String.Backtick)),
             include('inline'),
         ],
         'inline': [
             # escape
             (r'\\.', Text),
-            # italics
-            (r'(\s)([*_][^*_]+[*_])(\W|\n)', bygroups(Text, Generic.Emph, Text)),
-            # bold
-            # warning: the following rule eats internal tags. eg. **foo _bar_ baz** bar is not italics
-            (r'(\s)((\*\*|__).*\3)((?=\W|\n))', bygroups(Text, Generic.Strong, None, Text)),
-            # "proper way" (r'(\s)([*_]{2}[^*_]+[*_]{2})((?=\W|\n))', bygroups(Text, Generic.Strong, Text)),
-            # strikethrough
-            (r'(\s)(~~[^~]+~~)((?=\W|\n))', bygroups(Text, Generic.Deleted, Text)),
             # inline code
-            (r'`[^`]+`', String.Backtick),
+            (r'([^`])(`[^`\n]+`)', bygroups(Text, String.Backtick)),
+            # warning: the following rules eat outer tags.
+            # eg. **foo _bar_ baz** => foo and baz are not recognized as bold
+            # bold fenced by '**'
+            (r'(\*\*[^\*\n\ ][^\*\n]*\*\*)', bygroups(Generic.Strong)),
+            # # bold fenced by '__'
+            (r'(\_\_[^\_\n\ ][^\_\n]*\_\_)', bygroups(Generic.Strong)),
+            # italics fenced by '*'
+            (r'(\*[^\*\n\ ][^\*\n]*\*)', bygroups(Generic.Emph)),
+            # italics fenced by '_'
+            (r'(\_[^\_\n\ ][^\_\n]*\_)', bygroups(Generic.Emph)),
+            # strikethrough
+            (r'([^~]*)(~~[^~]+~~)', bygroups(Text, Generic.Deleted)),
             # mentions and topics (twitter and github stuff)
             (r'[@#][\w/:]+', Name.Entity),
             # (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
diff --git a/tests/test_markdown_lexer.py b/tests/test_markdown_lexer.py
index 9024bf07..524becd7 100644
--- a/tests/test_markdown_lexer.py
+++ b/tests/test_markdown_lexer.py
@@ -8,6 +8,7 @@
 """

 import pytest

+from pygments.token import Generic, Token, String, Keyword, Name
 from pygments.lexers.markup import MarkdownLexer

@@ -34,3 +35,527 @@ def test_code_fence_gsm(lexer):

 def test_code_fence_gsm_with_no_lexer(lexer):
     assert_same_text(lexer, r'```invalid-lexer\nfoo\n```\n')
+
+
+def test_invalid_atx_heading(lexer):
+    fragments = (
+        '#',
+        'a #',
+        '*#',
+    )
+
+    for fragment in fragments:
+        for token, _ in lexer.get_tokens(fragment):
+            assert token != Generic.Heading
+
+
+def test_atx_heading(lexer):
+    fragments = (
+        '#Heading',
+        '# Heading',
+        '# Another heading',
+        '# Another # heading',
+        '# Heading #',
+    )
+
+    for fragment in fragments:
+        tokens = [
+            (Generic.Heading, fragment),
+            (Token.Text, '\n'),
+        ]
+        assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_atx_subheading(lexer):
+    fragments = (
+        '##',
+        'a ##',
+        '*##',
+        '####### too many hashes'
+    )
+
+    for fragment in fragments:
+        for token, _ in lexer.get_tokens(fragment):
+            assert token != Generic.Subheading
+
+
+def test_atx_subheading(lexer):
+    fragments = (
+        '##Subheading',
+        '## Subheading',
+        '### Subheading',
+        '#### Subheading',
+        '##### Subheading',
+        '###### Subheading',
+        '## Another subheading',
+        '## Another ## subheading',
+        '###### Subheading #',
+        '###### Subheading ######',
+    )
+
+    for fragment in fragments:
+        tokens = [
+            (Generic.Subheading, fragment),
+            (Token.Text, '\n'),
+        ]
+        assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_setext_heading(lexer):
+    fragments = (
+        'Heading\n',
+        'Heading\n_',
+        'Heading\n =====',
+        'Heading\na=====',
+        '=====',
+        '\n=\n',
+        'Heading\n=====Text'
+    )
+
+    for fragment in fragments:
+        for token, _ in lexer.get_tokens(fragment):
+            assert token != Generic.Heading
+
+
+def test_setext_heading(lexer):
+    fragments = (
+        'Heading\n=',
+        'Heading\n=======',
+        'Heading\n==========',
+    )
+
+    for fragment in fragments:
+        tokens = [
+            (Generic.Heading, fragment.split('\n')[0]),
+            (Token.Text, '\n'),
+            (Generic.Heading, fragment.split('\n')[1]),
+            (Token.Text, '\n'),
+        ]
+        assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_setext_subheading(lexer):
+    fragments = (
+        'Subheading\n',
+        'Subheading\n_',
+        'Subheading\n -----',
+        'Subheading\na-----',
+        '-----',
+        '\n-\n',
+        'Subheading\n-----Text'
+    )
+
+    for fragment in fragments:
+        for token, _ in lexer.get_tokens(fragment):
+            assert token != Generic.Subheading
+
+
+def test_setext_subheading(lexer):
+    fragments = (
+        'Subheading\n-',
+        'Subheading\n----------',
+        'Subheading\n-----------',
+    )
+
+    for fragment in fragments:
+        tokens = [
+            (Generic.Subheading, fragment.split('\n')[0]),
+            (Token.Text, '\n'),
+            (Generic.Subheading, fragment.split('\n')[1]),
+            (Token.Text, '\n'),
+        ]
+        assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_task_list(lexer):
+    fragment = '- [ ] sample task'
+    tokens = [
+        (Keyword, '- '),
+        (Keyword, '[ ]'),
+        (Token.Text, ' '),
+        (Token.Text, 'sample'),
+        (Token.Text, ' '),
+        (Token.Text, 'task'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '* [ ] sample task'
+    tokens = [
+        (Keyword, '* '),
+        (Keyword, '[ ]'),
+        (Token.Text, ' '),
+        (Token.Text, 'sample'),
+        (Token.Text, ' '),
+        (Token.Text, 'task'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = ' * [ ] sample task'
+    tokens = [
+        (Token.Text, ' '),
+        (Keyword, '* '),
+        (Keyword, '[ ]'),
+        (Token.Text, ' '),
+        (Token.Text, 'sample'),
+        (Token.Text, ' '),
+        (Token.Text, 'task'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_bulleted_list(lexer):
+    fragment = '* foo\n* bar'
+    tokens = [
+        (Keyword, '*'),
+        (Token.Text, ' '),
+        (Token.Text, 'foo'),
+        (Token.Text, '\n'),
+        (Keyword, '*'),
+        (Token.Text, ' '),
+        (Token.Text, 'bar'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '- foo\n- bar'
+    tokens = [
+        (Keyword, '-'),
+        (Token.Text, ' '),
+        (Token.Text, 'foo'),
+        (Token.Text, '\n'),
+        (Keyword, '-'),
+        (Token.Text, ' '),
+        (Token.Text, 'bar'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '* *foo*\n* bar'
+    tokens = [
+        (Keyword, '*'),
+        (Token.Text, ' '),
+        (Generic.Emph, '*foo*'),
+        (Token.Text, '\n'),
+        (Keyword, '*'),
+        (Token.Text, ' '),
+        (Token.Text, 'bar'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_numbered_list(lexer):
+    fragment = '1. foo\n2. bar'
+    tokens = [
+        (Keyword, '1.'),
+        (Token.Text, ' '),
+        (Token.Text, 'foo'),
+        (Token.Text, '\n'),
+        (Keyword, '2.'),
+        (Token.Text, ' '),
+        (Token.Text, 'bar'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_quote(lexer):
+    fragment = '> a\n> quote'
+    tokens = [
+        (Keyword, '> '),
+        (Generic.Emph, 'a\n'),
+        (Keyword, '> '),
+        (Generic.Emph, 'quote\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_code_block(lexer):
+    fragments = (
+        '```code```',
+        'prefix not allowed before ```\ncode block\n```'
+        ' code',
+    )
+
+    for fragment in fragments:
+        for token, _ in lexer.get_tokens(fragment):
+            assert token != String.Backtick
+
+
+def test_code_block_fenced_by_backticks(lexer):
+    fragments = (
+        '```\ncode\n```',
+        '```\nmulti\n`line`\ncode\n```',
+    )
+    for fragment in fragments:
+        tokens = [
+            (String.Backtick, fragment),
+            (Token.Text, '\n'),
+        ]
+        assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_code_block_with_language(lexer):
+    fragments = (
+        '```python\nimport this\n```',
+    )
+    for fragment in fragments:
+        tokens = [
+            (String.Backtick, '```'),
+            (String.Backtick, 'python'),
+            (Token.Text, '\n'),
+            (Token.Keyword.Namespace, 'import'),
+            (Token.Text, ' '),
+            (Token.Name.Namespace, 'this'),
+            (Token.Text, '\n'),
+            (String.Backtick, '```'),
+            (Token.Text, '\n'),
+        ]
+        assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_code_indented_with_spaces(lexer):
+    fragments = (
+        'sample:\n\n    code\n',
+    )
+    for fragment in fragments:
+        tokens = [
+            (Token.Text, 'sample:'),
+            (Token.Text, '\n\n'),
+            (String.Backtick, '    code\n'),
+        ]
+        assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragments = (
+        'sample:\n\n\tcode\n',
+    )
+    for fragment in fragments:
+        tokens = [
+            (Token.Text, 'sample:'),
+            (Token.Text, '\n\n'),
+            (String.Backtick, '\tcode\n'),
+        ]
+        assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_inline_code(lexer):
+    fragment = 'code: `code`'
+    tokens = [
+        (Token.Text, 'code:'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = ' `**code**`'
+    tokens = [
+        (Token.Text, ' '),
+        (String.Backtick, '`**code**`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '(`code`)'
+    tokens = [
+        (Token.Text, '('),
+        (String.Backtick, '`code`'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_bold(lexer):
+    fragments = (
+        '**no bold__',
+        '__no bold**',
+        '*no bold*',
+        '_no bold_',
+    )
+
+    for fragment in fragments:
+        for token, _ in lexer.get_tokens(fragment):
+            assert token != Generic.Strong
+
+
+def test_bold_fenced_by_asterisk(lexer):
+    fragment = '**bold**'
+    tokens = [
+        (Generic.Strong, '**bold**'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_bold_fenced_by_underscore(lexer):
+    fragment = '__bold__'
+    tokens = [
+        (Generic.Strong, '__bold__'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_invalid_italics(lexer):
+    fragments = (
+        '*no italics_',
+        '_no italics*',
+        '**no italics**',
+        '__no italics__',
+    )
+
+    for fragment in fragments:
+        for token, _ in lexer.get_tokens(fragment):
+            assert token != Generic.Emph
+
+
+def test_italics_fenced_by_asterisk(lexer):
+    fragment = '*italics*'
+    tokens = [
+        (Generic.Emph, '*italics*'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_italics_fenced_by_underscore(lexer):
+    fragment = '_italics_'
+    tokens = [
+        (Generic.Emph, '_italics_'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_escape_italics(lexer):
+    fragments = (
+        r'\*no italics\*',
+        r'\_ no italics \_',
+    )
+
+    for fragment in fragments:
+        for token, _ in lexer.get_tokens(fragment):
+            assert token != Generic.Emph
+
+
+def test_italics_no_multiline(lexer):
+    fragment = '*no\nitalics*'
+
+    for token, _ in lexer.get_tokens(fragment):
+        assert token != Generic.Emph
+
+
+def test_italics_and_bold(lexer):
+    fragment = '**bold** and *italics*'
+    tokens = [
+        (Generic.Strong, '**bold**'),
+        (Token.Text, ' '),
+        (Token.Text, 'and'),
+        (Token.Text, ' '),
+        (Generic.Emph, '*italics*'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '*italics* and **bold**'
+    tokens = [
+        (Generic.Emph, '*italics*'),
+        (Token.Text, ' '),
+        (Token.Text, 'and'),
+        (Token.Text, ' '),
+        (Generic.Strong, '**bold**'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_strikethrough(lexer):
+    fragment = '~~striked~~not striked'
+    tokens = [
+        (Generic.Deleted, '~~striked~~'),
+        (Token.Text, 'not'),
+        (Token.Text, ' '),
+        (Token.Text, 'striked'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_mentions(lexer):
+    fragment = 'note for @me:'
+    tokens = [
+        (Token.Text, 'note'),
+        (Token.Text, ' '),
+        (Token.Text, 'for'),
+        (Token.Text, ' '),
+        (Name.Entity, '@me:'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_topics(lexer):
+    fragment = 'message to #you:'
+    tokens = [
+        (Token.Text, 'message'),
+        (Token.Text, ' '),
+        (Token.Text, 'to'),
+        (Token.Text, ' '),
+        (Name.Entity, '#you:'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_links(lexer):
+    fragment = '[text](link)'
+    tokens = [
+        (Token.Text, '['),
+        (Token.Name.Tag, 'text'),
+        (Token.Text, ']'),
+        (Token.Text, '('),
+        (Token.Name.Attribute, 'link'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '![Image of foo](https://bar.baz)'
+    tokens = [
+        (Token.Text, '!['),
+        (Token.Name.Tag, 'Image of foo'),
+        (Token.Text, ']'),
+        (Token.Text, '('),
+        (Token.Name.Attribute, 'https://bar.baz'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_reference_style_links(lexer):
+    fragment = '[an example][id]'
+    tokens = [
+        (Token.Text, '['),
+        (Token.Name.Tag, 'an example'),
+        (Token.Text, ']'),
+        (Token.Text, '['),
+        (Token.Name.Label, 'id'),
+        (Token.Text, ']'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '[id]: http://example.com'
+    tokens = [
+        (Token.Text, '['),
+        (Token.Name.Label, 'id'),
+        (Token.Text, ']: '),
+        (Token.Name.Attribute, 'http://example.com'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
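
A minimal usage sketch, not part of the patch, showing what the reworked rules are expected to produce for a setext heading and an indented code block; it simply mirrors the new tests and assumes a Pygments checkout with this change applied.

```python
from pygments.lexers.markup import MarkdownLexer
from pygments.token import Generic, String

lexer = MarkdownLexer()

# Setext-style heading: both the text and the '=' underline come back as Generic.Heading.
heading_tokens = list(lexer.get_tokens('Heading\n=======\n'))
assert (Generic.Heading, 'Heading') in heading_tokens
assert (Generic.Heading, '=======') in heading_tokens

# Code indented by four spaces after a blank line is now highlighted as String.Backtick.
indented_tokens = list(lexer.get_tokens('sample:\n\n    code\n'))
assert (String.Backtick, '    code\n') in indented_tokens
```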