#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Lexing error finder
~~~~~~~~~~~~~~~~~~~
For the source files given on the command line, display
the text where Error tokens are being generated, along
with some context.
:copyright: Copyright 2006-2015 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
from __future__ import print_function

import os
import sys

# always prefer Pygments from source if it exists
srcpath = os.path.join(os.path.dirname(__file__), '..')
if os.path.isdir(os.path.join(srcpath, 'pygments')):
    sys.path.insert(0, srcpath)

from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
    ProfilingRegexLexer, ProfilingRegexLexerMeta
from pygments.lexers import get_lexer_by_name, find_lexer_class, \
    find_lexer_class_for_filename
from pygments.token import Error, Text, _TokenType
from pygments.cmdline import _parse_options


class DebuggingRegexLexer(ExtendedRegexLexer):
    """Make the state stack, position and current match instance attributes."""

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``).
        """
        tokendefs = self._tokens
        self.ctx = ctx = LexerContext(text, 0)
        ctx.stack = list(stack)
        statetokens = tokendefs[ctx.stack[-1]]
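        # Main loop (comment added for clarity): try every rule of the
        # current state in order; on a match, emit its tokens, perform the
        # state transition, and rescan from the new position.  If no rule
        # matches, emit an Error token (or reset to 'root' at end of line)
        # and advance one character.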
        while 1:
            for rexmatch, action, new_state in statetokens:
                self.m = m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            if not isinstance(self, ExtendedRegexLexer):
                                for item in action(self, m):
                                    yield item
                                ctx.pos = m.end()
                            else:
                                for item in action(self, m, ctx):
                                    yield item
                                if not new_state:
                                    # altered the state stack?
                                    statetokens = tokendefs[ctx.stack[-1]]
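                    # ``new_state`` follows the regex-lexer convention
                    # (comment added for clarity): a tuple of state names
                    # (or '#pop'/'#push' markers), a negative int meaning
                    # "pop this many states", or '#push' to duplicate the
                    # current state on the stack.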
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, 'wrong state def: %r' % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to 'root'
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, u'\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break
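

# Usage sketch (a hedged example, not part of this script's CLI): rebinding a
# concrete lexer's __bases__, as the commented-out block in main() below does,
# routes its tokenization through DebuggingRegexLexer.  PythonLexer here is an
# arbitrary RegexLexer subclass chosen for illustration:
#
#   from pygments.lexers import PythonLexer
#   PythonLexer.__bases__ = (DebuggingRegexLexer,)
#   lx = PythonLexer()
#   for pos, tok, val in lx.get_tokens_unprocessed(u'def f(:\n'):
#       print(pos, tok, lx.ctx.stack)  # state stack after each token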


def main(fn, lexer=None, options={}):
    if lexer is not None:
        lxcls = get_lexer_by_name(lexer).__class__
    else:
        lxcls = find_lexer_class_for_filename(os.path.basename(fn))
        if lxcls is None:
            name, rest = fn.split('_', 1)
            lxcls = find_lexer_class(name)
            if lxcls is None:
                raise AssertionError('no lexer found for file %r' % fn)
    debug_lexer = False
    # if profile:
    #     # does not work for e.g. ExtendedRegexLexers
    #     if lxcls.__bases__ == (RegexLexer,):
    #         # yes we can! (change the metaclass)
    #         lxcls.__class__ = ProfilingRegexLexerMeta
    #         lxcls.__bases__ = (ProfilingRegexLexer,)
    #         lxcls._prof_sort_index = profsort
    # else:
    #     if lxcls.__bases__ == (RegexLexer,):
    #         lxcls.__bases__ = (DebuggingRegexLexer,)
    #         debug_lexer = True
    #     elif lxcls.__bases__ == (DebuggingRegexLexer,):
    #         # already debugged before
    #         debug_lexer = True
    #     else:
    #         # HACK: ExtendedRegexLexer subclasses will only partially work here.
    #         lxcls.__bases__ = (DebuggingRegexLexer,)
    #         debug_lexer = True
    lx = lxcls(**options)
    lno = 1
    if fn == '-':
        text = sys.stdin.read()
    else:
        with open(fn, 'rb') as fp:
            text = fp.read().decode('utf-8')
    text = text.strip('\n') + '\n'
    tokens = []
    states = []

    def show_token(tok, state):
        reprs = list(map(repr, tok))
        print(' ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0], end=' ')
        if debug_lexer:
            print(' ' + ' ' * (29-len(reprs[0])) + ' : '.join(state) if state else '', end=' ')
        print()

    for type, val in lx.get_tokens(text):
        lno += val.count('\n')
        if type == Error and not ignerror:
            print('Error parsing', fn, 'on line', lno)
            if not showall:
                print('Previous tokens' + (debug_lexer and ' and states' or '') + ':')
                for i in range(max(len(tokens) - num, 0), len(tokens)):
                    if debug_lexer:
                        show_token(tokens[i], states[i])
                    else:
                        show_token(tokens[i], None)
            print('Error token:')
            l = len(repr(val))
            print(' ' + repr(val), end=' ')
            if debug_lexer and hasattr(lx, 'ctx'):
                print(' ' * (60-l) + ' : '.join(lx.ctx.stack), end=' ')
            print()
            print()
            return 1
        tokens.append((type, val))
        if debug_lexer:
            if hasattr(lx, 'ctx'):
                states.append(lx.ctx.stack[:])
            else:
                states.append(None)
        if showall:
            show_token((type, val), states[-1] if debug_lexer else None)
    return 0


def print_help():
    print('''\
Pygments development helper to quickly debug lexers.

    scripts/debug_lexer.py [options] file ...

Give one or more filenames to lex them and display possible error tokens
and/or profiling info.  Files are assumed to be encoded in UTF-8.

Selecting lexer and options:

    -l NAME         use lexer named NAME (default is to guess from
                    the given filenames)
    -O OPTIONSTR    use lexer options parsed from OPTIONSTR

Debugging lexing errors:

    -n N            show the last N tokens on error
    -a              always show all lexed tokens (default is only
                    to show them when an error occurs)
    -e              do not stop on error tokens

Profiling:

    -p              use the ProfilingRegexLexer to profile regexes
                    instead of the debugging lexer
    -s N            sort profiling output by column N (default is
                    column 4, the time per call)
''')
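
# Example invocations (file names are hypothetical):
#
#   scripts/debug_lexer.py -l python -n 5 some_module.py
#   scripts/debug_lexer.py -a tests/examplefiles/example.lua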
num = 10
showall = False
ignerror = False
lexer = None
options = {}
profile = False
profsort = 4

if __name__ == '__main__':
    import getopt
    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:h')
    for opt, val in opts:
        if opt == '-n':
            num = int(val)
        elif opt == '-a':
            showall = True
        elif opt == '-e':
            ignerror = True
        elif opt == '-l':
            lexer = val
        elif opt == '-p':
            profile = True
        elif opt == '-s':
            profsort = int(val)
        elif opt == '-O':
            options = _parse_options([val])
        elif opt == '-h':
            print_help()
            sys.exit(0)
    ret = 0
    if not args:
        print_help()
    for f in args:
        ret += main(f, lexer, options)
    sys.exit(bool(ret))