author | Matthäus G. Chajdas <dev@anteru.net> | 2019-11-10 13:56:53 +0100 |
---|---|---|
committer | Matthäus G. Chajdas <dev@anteru.net> | 2019-11-10 13:56:53 +0100 |
commit | 1dd3124a9770e11b6684e5dd1e6bc15a0aa3bc67 (patch) | |
tree | 87a171383266dd1f64196589af081bc2f8e497c3 /scripts/debug_lexer.py | |
parent | f1c080e184dc1bbc36eaa7cd729ff3a499de568a (diff) | |
download | pygments-master.tar.gz |
Diffstat (limited to 'scripts/debug_lexer.py')
-rwxr-xr-x | scripts/debug_lexer.py | 246 |
1 files changed, 0 insertions, 246 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
deleted file mode 100755
index ef01a23f..00000000
--- a/scripts/debug_lexer.py
+++ /dev/null
@@ -1,246 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-"""
-    Lexing error finder
-    ~~~~~~~~~~~~~~~~~~~
-
-    For the source files given on the command line, display
-    the text where Error tokens are being generated, along
-    with some context.
-
-    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
-    :license: BSD, see LICENSE for details.
-"""
-
-from __future__ import print_function
-
-import os
-import sys
-
-# always prefer Pygments from source if exists
-srcpath = os.path.join(os.path.dirname(__file__), '..')
-if os.path.isdir(os.path.join(srcpath, 'pygments')):
-    sys.path.insert(0, srcpath)
-
-
-from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
-    ProfilingRegexLexer, ProfilingRegexLexerMeta
-from pygments.lexers import get_lexer_by_name, find_lexer_class, \
-    find_lexer_class_for_filename
-from pygments.token import Error, Text, _TokenType
-from pygments.cmdline import _parse_options
-
-
-class DebuggingRegexLexer(ExtendedRegexLexer):
-    """Make the state stack, position and current match instance attributes."""
-
-    def get_tokens_unprocessed(self, text, stack=('root',)):
-        """
-        Split ``text`` into (tokentype, text) pairs.
-
-        ``stack`` is the inital stack (default: ``['root']``)
-        """
-        tokendefs = self._tokens
-        self.ctx = ctx = LexerContext(text, 0)
-        ctx.stack = list(stack)
-        statetokens = tokendefs[ctx.stack[-1]]
-        while 1:
-            for rexmatch, action, new_state in statetokens:
-                self.m = m = rexmatch(text, ctx.pos, ctx.end)
-                if m:
-                    if action is not None:
-                        if type(action) is _TokenType:
-                            yield ctx.pos, action, m.group()
-                            ctx.pos = m.end()
-                        else:
-                            if not isinstance(self, ExtendedRegexLexer):
-                                for item in action(self, m):
-                                    yield item
-                                ctx.pos = m.end()
-                            else:
-                                for item in action(self, m, ctx):
-                                    yield item
-                                if not new_state:
-                                    # altered the state stack?
-                                    statetokens = tokendefs[ctx.stack[-1]]
-                    if new_state is not None:
-                        # state transition
-                        if isinstance(new_state, tuple):
-                            for state in new_state:
-                                if state == '#pop':
-                                    ctx.stack.pop()
-                                elif state == '#push':
-                                    ctx.stack.append(ctx.stack[-1])
-                                else:
-                                    ctx.stack.append(state)
-                        elif isinstance(new_state, int):
-                            # pop
-                            del ctx.stack[new_state:]
-                        elif new_state == '#push':
-                            ctx.stack.append(ctx.stack[-1])
-                        else:
-                            assert False, 'wrong state def: %r' % new_state
-                        statetokens = tokendefs[ctx.stack[-1]]
-                    break
-            else:
-                try:
-                    if ctx.pos >= ctx.end:
-                        break
-                    if text[ctx.pos] == '\n':
-                        # at EOL, reset state to 'root'
-                        ctx.stack = ['root']
-                        statetokens = tokendefs['root']
-                        yield ctx.pos, Text, u'\n'
-                        ctx.pos += 1
-                        continue
-                    yield ctx.pos, Error, text[ctx.pos]
-                    ctx.pos += 1
-                except IndexError:
-                    break
-
-
-def main(fn, lexer=None, options={}):
-    if lexer is not None:
-        lxcls = get_lexer_by_name(lexer).__class__
-    else:
-        lxcls = find_lexer_class_for_filename(os.path.basename(fn))
-        if lxcls is None:
-            name, rest = fn.split('_', 1)
-            lxcls = find_lexer_class(name)
-            if lxcls is None:
-                raise AssertionError('no lexer found for file %r' % fn)
-    print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
-                                       lxcls.__name__))
-    debug_lexer = False
-    # if profile:
-    #     # does not work for e.g. ExtendedRegexLexers
-    #     if lxcls.__bases__ == (RegexLexer,):
-    #         # yes we can! (change the metaclass)
-    #         lxcls.__class__ = ProfilingRegexLexerMeta
-    #         lxcls.__bases__ = (ProfilingRegexLexer,)
-    #         lxcls._prof_sort_index = profsort
-    # else:
-    #     if lxcls.__bases__ == (RegexLexer,):
-    #         lxcls.__bases__ = (DebuggingRegexLexer,)
-    #         debug_lexer = True
-    #     elif lxcls.__bases__ == (DebuggingRegexLexer,):
-    #         # already debugged before
-    #         debug_lexer = True
-    #     else:
-    #         # HACK: ExtendedRegexLexer subclasses will only partially work here.
-    #         lxcls.__bases__ = (DebuggingRegexLexer,)
-    #         debug_lexer = True
-
-    lx = lxcls(**options)
-    lno = 1
-    if fn == '-':
-        text = sys.stdin.read()
-    else:
-        with open(fn, 'rb') as fp:
-            text = fp.read().decode('utf-8')
-    text = text.strip('\n') + '\n'
-    tokens = []
-    states = []
-
-    def show_token(tok, state):
-        reprs = list(map(repr, tok))
-        print(' ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0], end=' ')
-        if debug_lexer:
-            print(' ' + ' ' * (29-len(reprs[0])) + ' : '.join(state) if state else '', end=' ')
-        print()
-
-    for type, val in lx.get_tokens(text):
-        lno += val.count('\n')
-        if type == Error and not ignerror:
-            print('Error parsing', fn, 'on line', lno)
-            if not showall:
-                print('Previous tokens' + (debug_lexer and ' and states' or '') + ':')
-                for i in range(max(len(tokens) - num, 0), len(tokens)):
-                    if debug_lexer:
-                        show_token(tokens[i], states[i])
-                    else:
-                        show_token(tokens[i], None)
-            print('Error token:')
-            l = len(repr(val))
-            print(' ' + repr(val), end=' ')
-            if debug_lexer and hasattr(lx, 'ctx'):
-                print(' ' * (60-l) + ' : '.join(lx.ctx.stack), end=' ')
-            print()
-            print()
-            return 1
-        tokens.append((type, val))
-        if debug_lexer:
-            if hasattr(lx, 'ctx'):
-                states.append(lx.ctx.stack[:])
-            else:
-                states.append(None)
-        if showall:
-            show_token((type, val), states[-1] if debug_lexer else None)
-    return 0
-
-
-def print_help():
-    print('''\
-Pygments development helper to quickly debug lexers.
-
-    scripts/debug_lexer.py [options] file ...
-
-Give one or more filenames to lex them and display possible error tokens
-and/or profiling info. Files are assumed to be encoded in UTF-8.
-
-Selecting lexer and options:
-
-    -l NAME         use lexer named NAME (default is to guess from
-                    the given filenames)
-    -O OPTIONSTR    use lexer options parsed from OPTIONSTR
-
-Debugging lexing errors:
-
-    -n N            show the last N tokens on error
-    -a              always show all lexed tokens (default is only
-                    to show them when an error occurs)
-    -e              do not stop on error tokens
-
-Profiling:
-
-    -p              use the ProfilingRegexLexer to profile regexes
-                    instead of the debugging lexer
-    -s N            sort profiling output by column N (default is
-                    column 4, the time per call)
-''')
-
-num = 10
-showall = False
-ignerror = False
-lexer = None
-options = {}
-profile = False
-profsort = 4
-
-if __name__ == '__main__':
-    import getopt
-    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:h')
-    for opt, val in opts:
-        if opt == '-n':
-            num = int(val)
-        elif opt == '-a':
-            showall = True
-        elif opt == '-e':
-            ignerror = True
-        elif opt == '-l':
-            lexer = val
-        elif opt == '-p':
-            profile = True
-        elif opt == '-s':
-            profsort = int(val)
-        elif opt == '-O':
-            options = _parse_options([val])
-        elif opt == '-h':
-            print_help()
-            sys.exit(0)
-    ret = 0
-    if not args:
-        print_help()
-    for f in args:
-        ret += main(f, lexer, options)
-    sys.exit(bool(ret))
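For reference, the core behaviour of the deleted helper — lexing a file and reporting where `Error` tokens are produced, together with the tokens that preceded them — can be approximated with the public Pygments API alone. The sketch below is illustrative and not part of this commit or the Pygments tree; the function name `find_error_tokens` and the default context of 10 tokens (mirroring the script's `-n` default) are assumptions made for the example.

```python
# Minimal sketch (illustrative, not part of this repository): report the first
# Error token much like scripts/debug_lexer.py did, via the public Pygments API.
import sys

from pygments.lexers import get_lexer_for_filename
from pygments.token import Error


def find_error_tokens(fn, context=10):
    """Lex ``fn`` and print the first Error token plus some preceding context."""
    with open(fn, 'rb') as fp:
        text = fp.read().decode('utf-8')
    lexer = get_lexer_for_filename(fn)   # guess the lexer from the file name
    recent = []                          # last ``context`` (tokentype, value) pairs
    lno = 1
    for ttype, value in lexer.get_tokens(text):
        lno += value.count('\n')
        if ttype == Error:
            print('Error parsing %s on line %d' % (fn, lno))
            print('Previous tokens:')
            for tok in recent:
                print('    %r' % (tok,))
            print('Error token: %r' % value)
            return 1
        recent = (recent + [(ttype, value)])[-context:]
    return 0


if __name__ == '__main__':
    sys.exit(find_error_tokens(sys.argv[1]))
```

Run as `python find_error_tokens.py somefile.ext`, this prints roughly the same diagnostics as `scripts/debug_lexer.py somefile.ext`, minus the state-stack display, which required the `DebuggingRegexLexer` subclass shown in the deleted file.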