scripts/debug_lexer: allow guessing from content

author: Georg Brandl <georg@python.org> 2020-12-19 18:37:36 +0100
committer: Georg Brandl <georg@python.org> 2020-12-19 18:38:24 +0100
commit: 873298b4d8511f635dba1d0d29fdf2a677f9d85f (patch)
tree: a2eaa122fd979e76e2ab7753c7b186c834c7e8f6 /scripts/debug_lexer.py
parent: a56ed8a1f2eb0db14aad50cb7e4eaaf7f2f0d3b3 (diff)
download: pygments-git-873298b4d8511f635dba1d0d29fdf2a677f9d85f.tar.gz
1 files changed, 23 insertions, 8 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
index 0f1dc0bd..059a489f 100755
--- a/scripts/debug_lexer.py
+++ b/scripts/debug_lexer.py
@@ -24,7 +24,7 @@ if os.path.isdir(os.path.join(srcpath, 'pygments')):
 from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
     ProfilingRegexLexer, ProfilingRegexLexerMeta
 from pygments.lexers import get_lexer_by_name, find_lexer_class, \
-    find_lexer_class_for_filename
+    find_lexer_class_for_filename, guess_lexer
 from pygments.token import Error, Text, _TokenType
 from pygments.cmdline import _parse_options
 
@@ -96,8 +96,24 @@ class DebuggingRegexLexer(ExtendedRegexLexer):
 
 
 def main(fn, lexer=None, options={}):
+    if fn == '-':
+        text = sys.stdin.read()
+    else:
+        try:
+            with open(fn, 'rb') as fp:
+                text = fp.read().decode('utf-8')
+        except UnicodeError:
+            print('Warning: non-UTF8 input, using latin1')
+            with open(fn, 'rb') as fp:
+                text = fp.read().decode('latin1')
+    text = text.strip('\n') + '\n'
+
     if lexer is not None:
         lxcls = get_lexer_by_name(lexer).__class__
+    elif guess:
+        lxcls = guess_lexer(text).__class__
+        print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
+                                           lxcls.__name__))
     else:
         lxcls = find_lexer_class_for_filename(os.path.basename(fn))
         if lxcls is None:
@@ -129,12 +145,6 @@ def main(fn, lexer=None, options={}):
 
     lx = lxcls(**options)
     lno = 1
-    if fn == '-':
-        text = sys.stdin.read()
-    else:
-        with open(fn, 'rb') as fp:
-            text = fp.read().decode('utf-8')
-    text = text.strip('\n') + '\n'
     tokens = []
     states = []
 
@@ -188,6 +198,7 @@ Selecting lexer and options:
 
     -l NAME         use lexer named NAME (default is to guess from
                     the given filenames)
+    -g              guess lexer from content
     -O OPTIONSTR    use lexer options parsed from OPTIONSTR
 
 Debugging lexing errors:
@@ -205,6 +216,7 @@ Profiling:
                     column 4, the time per call)
 ''')
 
+
 num = 10
 showall = False
 ignerror = False
@@ -212,10 +224,11 @@ lexer = None
 options = {}
 profile = False
 profsort = 4
+guess = False
 
 if __name__ == '__main__':
     import getopt
-    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:h')
+    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hg')
     for opt, val in opts:
         if opt == '-n':
             num = int(val)
@@ -231,6 +244,8 @@ if __name__ == '__main__':
             profsort = int(val)
         elif opt == '-O':
             options = _parse_options([val])
+        elif opt == '-g':
+            guess = True
         elif opt == '-h':
             print_help()
             sys.exit(0)
author	Georg Brandl <georg@python.org>	2020-12-19 18:37:36 +0100
committer	Georg Brandl <georg@python.org>	2020-12-19 18:38:24 +0100
commit	873298b4d8511f635dba1d0d29fdf2a677f9d85f (patch)
tree	a2eaa122fd979e76e2ab7753c7b186c834c7e8f6 /scripts/debug_lexer.py
parent	a56ed8a1f2eb0db14aad50cb7e4eaaf7f2f0d3b3 (diff)
download	pygments-git-873298b4d8511f635dba1d0d29fdf2a677f9d85f.tar.gz