diff options
author | Georg Brandl <georg@python.org> | 2020-12-19 18:37:36 +0100 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2020-12-19 18:38:24 +0100 |
commit | 873298b4d8511f635dba1d0d29fdf2a677f9d85f (patch) | |
tree | a2eaa122fd979e76e2ab7753c7b186c834c7e8f6 /scripts/debug_lexer.py | |
parent | a56ed8a1f2eb0db14aad50cb7e4eaaf7f2f0d3b3 (diff) | |
download | pygments-git-873298b4d8511f635dba1d0d29fdf2a677f9d85f.tar.gz |
scripts/debug_lexer: allow guessing from content
Diffstat (limited to 'scripts/debug_lexer.py')
-rwxr-xr-x | scripts/debug_lexer.py | 31 |
1 files changed, 23 insertions, 8 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py index 0f1dc0bd..059a489f 100755 --- a/scripts/debug_lexer.py +++ b/scripts/debug_lexer.py @@ -24,7 +24,7 @@ if os.path.isdir(os.path.join(srcpath, 'pygments')): from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \ ProfilingRegexLexer, ProfilingRegexLexerMeta from pygments.lexers import get_lexer_by_name, find_lexer_class, \ - find_lexer_class_for_filename + find_lexer_class_for_filename, guess_lexer from pygments.token import Error, Text, _TokenType from pygments.cmdline import _parse_options @@ -96,8 +96,24 @@ class DebuggingRegexLexer(ExtendedRegexLexer): def main(fn, lexer=None, options={}): + if fn == '-': + text = sys.stdin.read() + else: + try: + with open(fn, 'rb') as fp: + text = fp.read().decode('utf-8') + except UnicodeError: + print('Warning: non-UTF8 input, using latin1') + with open(fn, 'rb') as fp: + text = fp.read().decode('latin1') + text = text.strip('\n') + '\n' + if lexer is not None: lxcls = get_lexer_by_name(lexer).__class__ + elif guess: + lxcls = guess_lexer(text).__class__ + print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__, + lxcls.__name__)) else: lxcls = find_lexer_class_for_filename(os.path.basename(fn)) if lxcls is None: @@ -129,12 +145,6 @@ def main(fn, lexer=None, options={}): lx = lxcls(**options) lno = 1 - if fn == '-': - text = sys.stdin.read() - else: - with open(fn, 'rb') as fp: - text = fp.read().decode('utf-8') - text = text.strip('\n') + '\n' tokens = [] states = [] @@ -188,6 +198,7 @@ Selecting lexer and options: -l NAME use lexer named NAME (default is to guess from the given filenames) + -g guess lexer from content -O OPTIONSTR use lexer options parsed from OPTIONSTR Debugging lexing errors: @@ -205,6 +216,7 @@ Profiling: column 4, the time per call) ''') + num = 10 showall = False ignerror = False @@ -212,10 +224,11 @@ lexer = None options = {} profile = False profsort = 4 +guess = False if __name__ == '__main__': import getopt - opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:h') + opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hg') for opt, val in opts: if opt == '-n': num = int(val) @@ -231,6 +244,8 @@ if __name__ == '__main__': profsort = int(val) elif opt == '-O': options = _parse_options([val]) + elif opt == '-g': + guess = True elif opt == '-h': print_help() sys.exit(0) |