<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
|
"http://www.w3.org/TR/html4/strict.dtd">
|
|
<html>
|
|
<head>
|
|
<title>Write your own lexer — Pygments</title>
|
|
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
|
<style type="text/css">
|
|
body {
|
|
background-color: #f2f2f2;
|
|
margin: 0;
|
|
padding: 0;
|
|
font-family: 'Georgia', serif;
|
|
color: #111;
|
|
}
|
|
|
|
#content {
|
|
background-color: white;
|
|
padding: 20px;
|
|
margin: 20px auto 20px auto;
|
|
max-width: 800px;
|
|
border: 4px solid #ddd;
|
|
}
|
|
|
|
h1 {
|
|
font-weight: normal;
|
|
font-size: 40px;
|
|
color: #09839A;
|
|
}
|
|
|
|
h2 {
|
|
font-weight: normal;
|
|
font-size: 30px;
|
|
color: #C73F00;
|
|
}
|
|
|
|
h1.heading {
|
|
margin: 0 0 30px 0;
|
|
}
|
|
|
|
h2.subheading {
|
|
margin: -30px 0 0 45px;
|
|
}
|
|
|
|
h3 {
|
|
margin-top: 30px;
|
|
}
|
|
|
|
table.docutils {
|
|
border-collapse: collapse;
|
|
border: 2px solid #aaa;
|
|
margin: 0.5em 1.5em 0.5em 1.5em;
|
|
}
|
|
|
|
table.docutils td {
|
|
padding: 2px;
|
|
border: 1px solid #ddd;
|
|
}
|
|
|
|
p, li, dd, dt, blockquote {
|
|
font-size: 15px;
|
|
color: #333;
|
|
}
|
|
|
|
p {
|
|
line-height: 150%;
|
|
margin-bottom: 0;
|
|
margin-top: 10px;
|
|
}
|
|
|
|
hr {
|
|
border-top: 1px solid #ccc;
|
|
border-bottom: 0;
|
|
border-right: 0;
|
|
border-left: 0;
|
|
margin-bottom: 10px;
|
|
margin-top: 20px;
|
|
}
|
|
|
|
dl {
|
|
margin-left: 10px;
|
|
}
|
|
|
|
li, dt {
|
|
margin-top: 5px;
|
|
}
|
|
|
|
dt {
|
|
font-weight: bold;
|
|
}
|
|
|
|
th {
|
|
text-align: left;
|
|
}
|
|
|
|
a {
|
|
color: #990000;
|
|
}
|
|
|
|
a:hover {
|
|
color: #c73f00;
|
|
}
|
|
|
|
pre {
|
|
background-color: #f9f9f9;
|
|
border-top: 1px solid #ccc;
|
|
border-bottom: 1px solid #ccc;
|
|
padding: 5px;
|
|
font-size: 13px;
|
|
font-family: Bitstream Vera Sans Mono,monospace;
|
|
}
|
|
|
|
tt {
|
|
font-size: 13px;
|
|
font-family: Bitstream Vera Sans Mono,monospace;
|
|
color: black;
|
|
padding: 1px 2px 1px 2px;
|
|
background-color: #f0f0f0;
|
|
}
|
|
|
|
cite {
|
|
/* abusing <cite>, it's generated by ReST for `x` */
|
|
font-size: 13px;
|
|
font-family: Bitstream Vera Sans Mono,monospace;
|
|
font-weight: bold;
|
|
font-style: normal;
|
|
}
|
|
|
|
#backlink {
|
|
float: right;
|
|
font-size: 11px;
|
|
color: #888;
|
|
}
|
|
|
|
div.toc {
|
|
margin: 0 0 10px 0;
|
|
}
|
|
|
|
div.toc h2 {
|
|
font-size: 20px;
|
|
}
|
|
.syntax .hll { background-color: #ffffcc }
|
|
.syntax { background: #ffffff; }
|
|
.syntax .c { color: #888888 } /* Comment */
|
|
.syntax .err { color: #a61717; background-color: #e3d2d2 } /* Error */
|
|
.syntax .k { color: #008800; font-weight: bold } /* Keyword */
|
|
.syntax .cm { color: #888888 } /* Comment.Multiline */
|
|
.syntax .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
|
|
.syntax .c1 { color: #888888 } /* Comment.Single */
|
|
.syntax .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
|
|
.syntax .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
|
|
.syntax .ge { font-style: italic } /* Generic.Emph */
|
|
.syntax .gr { color: #aa0000 } /* Generic.Error */
|
|
.syntax .gh { color: #333333 } /* Generic.Heading */
|
|
.syntax .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
|
|
.syntax .go { color: #888888 } /* Generic.Output */
|
|
.syntax .gp { color: #555555 } /* Generic.Prompt */
|
|
.syntax .gs { font-weight: bold } /* Generic.Strong */
|
|
.syntax .gu { color: #666666 } /* Generic.Subheading */
|
|
.syntax .gt { color: #aa0000 } /* Generic.Traceback */
|
|
.syntax .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
|
|
.syntax .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
|
|
.syntax .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
|
|
.syntax .kp { color: #008800 } /* Keyword.Pseudo */
|
|
.syntax .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
|
|
.syntax .kt { color: #888888; font-weight: bold } /* Keyword.Type */
|
|
.syntax .m { color: #0000DD; font-weight: bold } /* Literal.Number */
|
|
.syntax .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
|
|
.syntax .na { color: #336699 } /* Name.Attribute */
|
|
.syntax .nb { color: #003388 } /* Name.Builtin */
|
|
.syntax .nc { color: #bb0066; font-weight: bold } /* Name.Class */
|
|
.syntax .no { color: #003366; font-weight: bold } /* Name.Constant */
|
|
.syntax .nd { color: #555555 } /* Name.Decorator */
|
|
.syntax .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
|
|
.syntax .nf { color: #0066bb; font-weight: bold } /* Name.Function */
|
|
.syntax .nl { color: #336699; font-style: italic } /* Name.Label */
|
|
.syntax .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
|
|
.syntax .py { color: #336699; font-weight: bold } /* Name.Property */
|
|
.syntax .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
|
|
.syntax .nv { color: #336699 } /* Name.Variable */
|
|
.syntax .ow { color: #008800 } /* Operator.Word */
|
|
.syntax .w { color: #bbbbbb } /* Text.Whitespace */
|
|
.syntax .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
|
|
.syntax .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
|
|
.syntax .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
|
|
.syntax .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
|
|
.syntax .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
|
|
.syntax .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
|
|
.syntax .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
|
|
.syntax .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
|
|
.syntax .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
|
|
.syntax .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
|
|
.syntax .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
|
|
.syntax .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
|
|
.syntax .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
|
|
.syntax .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
|
|
.syntax .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
|
|
.syntax .bp { color: #003388 } /* Name.Builtin.Pseudo */
|
|
.syntax .vc { color: #336699 } /* Name.Variable.Class */
|
|
.syntax .vg { color: #dd7700 } /* Name.Variable.Global */
|
|
.syntax .vi { color: #3333bb } /* Name.Variable.Instance */
|
|
.syntax .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
|
|
</style>
|
|
</head>
|
|
<body>
|
|
<div id="content">
|
|
<h1 class="heading">Pygments</h1>
|
|
<h2 class="subheading">Write your own lexer</h2>
|
|
|
|
<a id="backlink" href="index.html">« Back To Index</a>
|
|
|
|
|
|
<div class="toc">
|
|
<h2>Contents</h2>
|
|
<ul class="contents">
|
|
|
|
<li><a href="#regexlexer">RegexLexer</a></li>
|
|
|
|
<li><a href="#regex-flags">Regex Flags</a></li>
|
|
|
|
<li><a href="#scanning-multiple-tokens-at-once">Scanning multiple tokens at once</a></li>
|
|
|
|
<li><a href="#changing-states">Changing states</a></li>
|
|
|
|
<li><a href="#advanced-state-tricks">Advanced state tricks</a></li>
|
|
|
|
<li><a href="#using-multiple-lexers">Using multiple lexers</a></li>
|
|
|
|
<li><a href="#delegating-lexer">Delegating Lexer</a></li>
|
|
|
|
<li><a href="#callbacks">Callbacks</a></li>
|
|
|
|
<li><a href="#the-extendedregexlexer-class">The ExtendedRegexLexer class</a></li>
|
|
|
|
<li><a href="#filtering-token-streams">Filtering Token Streams</a></li>
|
|
|
|
</ul>
|
|
</div>
|
|
|
|
<!-- -*- mode: rst -*- -->
|
|
<p>If a lexer for your favorite language is missing in the Pygments package, you can
|
|
easily write your own and extend Pygments.</p>
|
|
<p>All you need can be found inside the <cite>pygments.lexer</cite> module. As you can read in
|
|
the <a class="reference external" href="./api.html">API documentation</a>, a lexer is a class that is initialized with
|
|
some keyword arguments (the lexer options) and that provides a
|
|
<cite>get_tokens_unprocessed()</cite> method which is given a string or unicode object with
|
|
the data to parse.</p>
|
|
<p>The <cite>get_tokens_unprocessed()</cite> method must return an iterator or iterable
|
|
containing tuples in the form <tt class="docutils literal">(index, token, value)</tt>. Normally you don't need
|
|
to do this since there are numerous base lexers you can subclass.</p>
|
|
<div class="section" id="regexlexer">
|
|
<h3>RegexLexer</h3>
|
|
<p>A very powerful (but quite easy to use) lexer is the <cite>RegexLexer</cite>. This lexer
|
|
base class allows you to define lexing rules in terms of <em>regular expressions</em>
|
|
for different <em>states</em>.</p>
|
|
<p>States are groups of regular expressions that are matched against the input
|
|
string at the <em>current position</em>. If one of these expressions matches, a
|
|
corresponding action is performed (normally yielding a token with a specific
|
|
type), the current position is set to where the last match ended and the
|
|
matching process continues with the first regex of the current state.</p>
|
|
<p>Lexer states are kept in a state stack: each time a new state is entered, the
|
|
new state is pushed onto the stack. The most basic lexers (like the
|
|
<cite>DiffLexer</cite>) just need one state.</p>
|
|
<p>Each state is defined as a list of tuples in the form (<cite>regex</cite>, <cite>action</cite>,
|
|
<cite>new_state</cite>) where the last item is optional. In the most basic form, <cite>action</cite>
|
|
is a token type (like <cite>Name.Builtin</cite>). That means: When <cite>regex</cite> matches, emit a
|
|
token with the match text and type <cite>tokentype</cite> and push <cite>new_state</cite> on the state
|
|
stack. If the new state is <tt class="docutils literal">'#pop'</tt>, the topmost state is popped from the
|
|
stack instead. (To pop more than one state, use <tt class="docutils literal">'#pop:2'</tt> and so on.)
|
|
<tt class="docutils literal">'#push'</tt> is a synonym for pushing the current state on the
|
|
stack.</p>
|
|
<p>The following example shows the <cite>DiffLexer</cite> from the builtin lexers. Note that
|
|
it contains some additional attributes <cite>name</cite>, <cite>aliases</cite> and <cite>filenames</cite> which
|
|
aren't required for a lexer. They are used by the builtin lexer lookup
|
|
functions.</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span>
|
|
|
|
<span class="k">class</span> <span class="nc">DiffLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span>
|
|
<span class="n">name</span> <span class="o">=</span> <span class="s">'Diff'</span>
|
|
<span class="n">aliases</span> <span class="o">=</span> <span class="p">[</span><span class="s">'diff'</span><span class="p">]</span>
|
|
<span class="n">filenames</span> <span class="o">=</span> <span class="p">[</span><span class="s">'*.diff'</span><span class="p">]</span>
|
|
|
|
<span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s">'root'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r' .*\n'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'\+.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Inserted</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'-.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Deleted</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'@.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Subheading</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'Index.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Heading</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'=.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Heading</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'.*\n'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
</pre></div>
|
|
<p>As you can see this lexer only uses one state. When the lexer starts scanning
|
|
the text, it first checks if the current character is a space. If this is true
|
|
it scans everything until newline and returns the parsed data as <cite>Text</cite> token.</p>
|
|
<p>If this rule doesn't match, it checks if the current char is a plus sign. And
|
|
so on.</p>
|
|
<p>If no rule matches at the current position, the current char is emitted as an
|
|
<cite>Error</cite> token that indicates a parsing error, and the position is increased by
|
|
1.</p>
|
|
</div>
|
|
<div class="section" id="regex-flags">
|
|
<h3>Regex Flags</h3>
|
|
<p>You can either define regex flags in the regex (<tt class="docutils literal"><span class="pre">r'(?x)foo</span> bar'</tt>) or by adding
|
|
a <cite>flags</cite> attribute to your lexer class. If no attribute is defined, it defaults
|
|
to <cite>re.MULTILINE</cite>. For more information about regular expression flags see the
|
|
<a class="reference external" href="http://docs.python.org/lib/re-syntax.html">regular expressions</a> help page in the Python documentation.</p>
|
|
</div>
|
|
<div class="section" id="scanning-multiple-tokens-at-once">
|
|
<h3>Scanning multiple tokens at once</h3>
|
|
<p>Here is a more complex lexer that highlights INI files. INI files consist of
|
|
sections, comments and key = value pairs:</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span><span class="p">,</span> <span class="n">bygroups</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span>
|
|
|
|
<span class="k">class</span> <span class="nc">IniLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span>
|
|
<span class="n">name</span> <span class="o">=</span> <span class="s">'INI'</span>
|
|
<span class="n">aliases</span> <span class="o">=</span> <span class="p">[</span><span class="s">'ini'</span><span class="p">,</span> <span class="s">'cfg'</span><span class="p">]</span>
|
|
<span class="n">filenames</span> <span class="o">=</span> <span class="p">[</span><span class="s">'*.ini'</span><span class="p">,</span> <span class="s">'*.cfg'</span><span class="p">]</span>
|
|
|
|
<span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s">'root'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'\s+'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r';.*?$'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'\[.*?\]$'</span><span class="p">,</span> <span class="n">Keyword</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'(.*?)(\s*)(=)(\s*)(.*?)$'</span><span class="p">,</span>
|
|
<span class="n">bygroups</span><span class="p">(</span><span class="n">Name</span><span class="o">.</span><span class="n">Attribute</span><span class="p">,</span> <span class="n">Text</span><span class="p">,</span> <span class="n">Operator</span><span class="p">,</span> <span class="n">Text</span><span class="p">,</span> <span class="n">String</span><span class="p">))</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
</pre></div>
|
|
<p>The lexer first looks for whitespace, comments and section names. And later it
|
|
looks for a line that looks like a key, value pair, separated by an <tt class="docutils literal">'='</tt>
|
|
sign, and optional whitespace.</p>
|
|
<p>The <cite>bygroups</cite> helper makes sure that each group is yielded with a different
|
|
token type. First the <cite>Name.Attribute</cite> token, then a <cite>Text</cite> token for the
|
|
optional whitespace, after that a <cite>Operator</cite> token for the equals sign. Then a
|
|
<cite>Text</cite> token for the whitespace again. The rest of the line is returned as
|
|
<cite>String</cite>.</p>
|
|
<p>Note that for this to work, every part of the match must be inside a capturing
|
|
group (a <tt class="docutils literal"><span class="pre">(...)</span></tt>), and there must not be any nested capturing groups. If you
|
|
nevertheless need a group, use a non-capturing group defined using this syntax:
|
|
<tt class="docutils literal"><span class="pre">r'(?:some|words|here)'</span></tt> (note the <tt class="docutils literal"><span class="pre">?:</span></tt> after the beginning parenthesis).</p>
|
|
<p>If you find yourself needing a capturing group inside the regex which
|
|
shouldn't be part of the output but is used in the regular expressions for
|
|
backreferencing (eg: <tt class="docutils literal"><span class="pre">r'(<(foo|bar)>)(.*?)(</\2>)'</span></tt>), you can pass <cite>None</cite>
|
|
to the bygroups function and that group will be skipped in the
|
|
output.</p>
|
|
</div>
|
|
<div class="section" id="changing-states">
|
|
<h3>Changing states</h3>
|
|
<p>Many lexers need multiple states to work as expected. For example, some
|
|
languages allow multiline comments to be nested. Since this is a recursive
|
|
pattern it's impossible to lex just using regular expressions.</p>
|
|
<p>Here is the solution:</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span>
|
|
|
|
<span class="k">class</span> <span class="nc">ExampleLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span>
|
|
<span class="n">name</span> <span class="o">=</span> <span class="s">'Example Lexer with states'</span>
|
|
|
|
<span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s">'root'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'[^/]+'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'/\*'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">,</span> <span class="s">'comment'</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'//.*?$'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Singleline</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'/'</span><span class="p">,</span> <span class="n">Text</span><span class="p">)</span>
|
|
<span class="p">],</span>
|
|
<span class="s">'comment'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'[^*/]'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'/\*'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">,</span> <span class="s">'#push'</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'\*/'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">,</span> <span class="s">'#pop'</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'[*/]'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">)</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
</pre></div>
|
|
<p>This lexer starts lexing in the <tt class="docutils literal">'root'</tt> state. It tries to match as much as
|
|
possible until it finds a slash (<tt class="docutils literal">'/'</tt>). If the next character after the slash
|
|
is a star (<tt class="docutils literal">'*'</tt>) the <cite>RegexLexer</cite> sends those two characters to the output
|
|
stream marked as <cite>Comment.Multiline</cite> and continues parsing with the rules
|
|
defined in the <tt class="docutils literal">'comment'</tt> state.</p>
|
|
<p>If there wasn't a star after the slash, the <cite>RegexLexer</cite> checks if it's a
|
|
singleline comment (eg: followed by a second slash). If this also wasn't the
|
|
case it must be a single slash (the separate regex for a single slash must also
|
|
be given, else the slash would be marked as an error token).</p>
|
|
<p>Inside the <tt class="docutils literal">'comment'</tt> state, we do the same thing again. Scan until the lexer
|
|
finds a star or slash. If it's the opening of a multiline comment, push the
|
|
<tt class="docutils literal">'comment'</tt> state on the stack and continue scanning, again in the
|
|
<tt class="docutils literal">'comment'</tt> state. Else, check if it's the end of the multiline comment. If
|
|
yes, pop one state from the stack.</p>
|
|
<p>Note: If you pop from an empty stack you'll get an <cite>IndexError</cite>. (There is an
|
|
easy way to prevent this from happening: don't <tt class="docutils literal">'#pop'</tt> in the root state).</p>
|
|
<p>If the <cite>RegexLexer</cite> encounters a newline that is flagged as an error token, the
|
|
stack is emptied and the lexer continues scanning in the <tt class="docutils literal">'root'</tt> state. This
|
|
helps producing error-tolerant highlighting for erroneous input, e.g. when a
|
|
single-line string is not closed.</p>
|
|
</div>
|
|
<div class="section" id="advanced-state-tricks">
|
|
<h3>Advanced state tricks</h3>
|
|
<p>There are a few more things you can do with states:</p>
|
|
<ul>
|
|
<li><p class="first">You can push multiple states onto the stack if you give a tuple instead of a
|
|
simple string as the third item in a rule tuple. For example, if you want to
|
|
match a comment containing a directive, something like:</p>
|
|
<pre class="literal-block">
|
|
/* <processing directive> rest of comment */
|
|
</pre>
|
|
<p>you can use this rule:</p>
|
|
<div class="syntax"><pre><span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s">'root'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'/\* <'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">,</span> <span class="p">(</span><span class="s">'comment'</span><span class="p">,</span> <span class="s">'directive'</span><span class="p">)),</span>
|
|
<span class="o">...</span>
|
|
<span class="p">],</span>
|
|
<span class="s">'directive'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'[^>]*'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Directive</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'>'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">,</span> <span class="s">'#pop'</span><span class="p">),</span>
|
|
<span class="p">],</span>
|
|
<span class="s">'comment'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'[^*]+'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'\*/'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">,</span> <span class="s">'#pop'</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'\*'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
</pre></div>
|
|
<p>When this encounters the above sample, first <tt class="docutils literal">'comment'</tt> and <tt class="docutils literal">'directive'</tt>
|
|
are pushed onto the stack, then the lexer continues in the directive state
|
|
until it finds the closing <tt class="docutils literal">></tt>, then it continues in the comment state until
|
|
the closing <tt class="docutils literal">*/</tt>. Then, both states are popped from the stack again and
|
|
lexing continues in the root state.</p>
|
|
<p><em>New in Pygments 0.9:</em> The tuple can contain the special <tt class="docutils literal">'#push'</tt> and
|
|
<tt class="docutils literal">'#pop'</tt> (but not <tt class="docutils literal">'#pop:n'</tt>) directives.</p>
|
|
</li>
|
|
<li><p class="first">You can include the rules of a state in the definition of another. This is
|
|
done by using <cite>include</cite> from <cite>pygments.lexer</cite>:</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span><span class="p">,</span> <span class="n">bygroups</span><span class="p">,</span> <span class="n">include</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span>
|
|
|
|
<span class="k">class</span> <span class="nc">ExampleLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span>
|
|
<span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s">'comments'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'/\*.*?\*/'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'//.*?\n'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span>
|
|
<span class="p">],</span>
|
|
<span class="s">'root'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="n">include</span><span class="p">(</span><span class="s">'comments'</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'(function )(\w+)( {)'</span><span class="p">,</span>
|
|
<span class="n">bygroups</span><span class="p">(</span><span class="n">Keyword</span><span class="p">,</span> <span class="n">Name</span><span class="p">,</span> <span class="n">Keyword</span><span class="p">),</span> <span class="s">'function'</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'.'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span>
|
|
<span class="p">],</span>
|
|
<span class="s">'function'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'[^}/]+'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span>
|
|
<span class="n">include</span><span class="p">(</span><span class="s">'comments'</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'/'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'}'</span><span class="p">,</span> <span class="n">Keyword</span><span class="p">,</span> <span class="s">'#pop'</span><span class="p">),</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
</pre></div>
|
|
<p>This is a hypothetical lexer for a language that consists of functions and
|
|
comments. Because comments can occur at toplevel and in functions, we need
|
|
rules for comments in both states. As you can see, the <cite>include</cite> helper saves
|
|
repeating rules that occur more than once (in this example, the state
|
|
<tt class="docutils literal">'comments'</tt> will never be entered by the lexer, as it's only there to be
|
|
included in <tt class="docutils literal">'root'</tt> and <tt class="docutils literal">'function'</tt>).</p>
|
|
</li>
|
|
<li><p class="first">Sometimes, you may want to "combine" a state from existing ones. This is
|
|
possible with the <cite>combined</cite> helper from <cite>pygments.lexer</cite>.</p>
|
|
<p>If you, instead of a new state, write <tt class="docutils literal"><span class="pre">combined('state1',</span> 'state2')</tt> as the
|
|
third item of a rule tuple, a new anonymous state will be formed from state1
|
|
and state2 and if the rule matches, the lexer will enter this state.</p>
|
|
<p>This is not used very often, but can be helpful in some cases, such as the
|
|
<cite>PythonLexer</cite>'s string literal processing.</p>
|
|
</li>
|
|
<li><p class="first">If you want your lexer to start lexing in a different state you can modify
|
|
the stack by overloading the <cite>get_tokens_unprocessed()</cite> method:</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span>
|
|
|
|
<span class="k">class</span> <span class="nc">MyLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span>
|
|
<span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span><span class="o">...</span><span class="p">}</span>
|
|
|
|
<span class="k">def</span> <span class="nf">get_tokens_unprocessed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">):</span>
|
|
<span class="n">stack</span> <span class="o">=</span> <span class="p">[</span><span class="s">'root'</span><span class="p">,</span> <span class="s">'otherstate'</span><span class="p">]</span>
|
|
<span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">RegexLexer</span><span class="o">.</span><span class="n">get_tokens_unprocessed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">,</span> <span class="n">stack</span><span class="p">):</span>
|
|
<span class="k">yield</span> <span class="n">item</span>
|
|
</pre></div>
|
|
<p>Some lexers like the <cite>PhpLexer</cite> use this to make the leading <tt class="docutils literal"><span class="pre"><?php</span></tt>
|
|
preprocessor comments optional. Note that you can crash the lexer easily
|
|
by putting values into the stack that don't exist in the token map. Also
|
|
removing <tt class="docutils literal">'root'</tt> from the stack can result in strange errors!</p>
|
|
</li>
|
|
<li><p class="first">An empty regex at the end of a state list, combined with <tt class="docutils literal">'#pop'</tt>, can
|
|
act as a return point from a state that doesn't have a clear end marker.</p>
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
<div class="section" id="using-multiple-lexers">
|
|
<h3>Using multiple lexers</h3>
|
|
<p>Using multiple lexers for the same input can be tricky. One of the easiest
|
|
combination techniques is shown here: You can replace the token type entry in a
|
|
rule tuple (the second item) with a lexer class. The matched text will then be
|
|
lexed with that lexer, and the resulting tokens will be yielded.</p>
|
|
<p>For example, look at this stripped-down HTML lexer:</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span><span class="p">,</span> <span class="n">bygroups</span><span class="p">,</span> <span class="n">using</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.lexers.web</span> <span class="kn">import</span> <span class="n">JavascriptLexer</span>
|
|
|
|
<span class="k">class</span> <span class="nc">HtmlLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span>
|
|
<span class="n">name</span> <span class="o">=</span> <span class="s">'HTML'</span>
|
|
<span class="n">aliases</span> <span class="o">=</span> <span class="p">[</span><span class="s">'html'</span><span class="p">]</span>
|
|
<span class="n">filenames</span> <span class="o">=</span> <span class="p">[</span><span class="s">'*.html'</span><span class="p">,</span> <span class="s">'*.htm'</span><span class="p">]</span>
|
|
|
|
<span class="n">flags</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">IGNORECASE</span> <span class="o">|</span> <span class="n">re</span><span class="o">.</span><span class="n">DOTALL</span>
|
|
<span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s">'root'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">'[^<&]+'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">'&.*?;'</span><span class="p">,</span> <span class="n">Name</span><span class="o">.</span><span class="n">Entity</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'<\s*script\s*'</span><span class="p">,</span> <span class="n">Name</span><span class="o">.</span><span class="n">Tag</span><span class="p">,</span> <span class="p">(</span><span class="s">'script-content'</span><span class="p">,</span> <span class="s">'tag'</span><span class="p">)),</span>
|
|
<span class="p">(</span><span class="s">r'<\s*[a-zA-Z0-9:]+'</span><span class="p">,</span> <span class="n">Name</span><span class="o">.</span><span class="n">Tag</span><span class="p">,</span> <span class="s">'tag'</span><span class="p">),</span>
|
|
<span class="p">(</span><span class="s">r'<\s*/\s*[a-zA-Z0-9:]+\s*>'</span><span class="p">,</span> <span class="n">Name</span><span class="o">.</span><span class="n">Tag</span><span class="p">),</span>
|
|
<span class="p">],</span>
|
|
<span class="s">'script-content'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'(.+?)(<\s*/\s*script\s*>)'</span><span class="p">,</span>
|
|
<span class="n">bygroups</span><span class="p">(</span><span class="n">using</span><span class="p">(</span><span class="n">JavascriptLexer</span><span class="p">),</span> <span class="n">Name</span><span class="o">.</span><span class="n">Tag</span><span class="p">),</span>
|
|
<span class="s">'#pop'</span><span class="p">),</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
</pre></div>
|
|
<p>Here the content of a <tt class="docutils literal"><script></tt> tag is passed to a newly created instance of
|
|
a <cite>JavascriptLexer</cite> and not processed by the <cite>HtmlLexer</cite>. This is done using the
|
|
<cite>using</cite> helper that takes the other lexer class as its parameter.</p>
|
|
<p>Note the combination of <cite>bygroups</cite> and <cite>using</cite>. This makes sure that the content
|
|
up to the <tt class="docutils literal"></script></tt> end tag is processed by the <cite>JavascriptLexer</cite>, while the
|
|
end tag is yielded as a normal token with the <cite>Name.Tag</cite> type.</p>
|
|
<p>As an additional goodie, if the lexer class is replaced by <cite>this</cite> (imported from
|
|
<cite>pygments.lexer</cite>), the "other" lexer will be the current one (because you cannot
|
|
refer to the current class within the code that runs at class definition time).</p>
|
|
<p>Also note the <tt class="docutils literal"><span class="pre">(r'<\s*script\s*',</span> Name.Tag, <span class="pre">('script-content',</span> <span class="pre">'tag'))</span></tt> rule.
|
|
Here, two states are pushed onto the state stack, <tt class="docutils literal"><span class="pre">'script-content'</span></tt> and
|
|
<tt class="docutils literal">'tag'</tt>. That means that first <tt class="docutils literal">'tag'</tt> is processed, which will parse
|
|
attributes and the closing <tt class="docutils literal">></tt>, then the <tt class="docutils literal">'tag'</tt> state is popped and the
|
|
next state on top of the stack will be <tt class="docutils literal"><span class="pre">'script-content'</span></tt>.</p>
|
|
<p>The <cite>using()</cite> helper has a special keyword argument, <cite>state</cite>, which works as
|
|
follows: if given, the lexer to use initially is not in the <tt class="docutils literal">"root"</tt> state,
|
|
but in the state given by this argument. This <em>only</em> works with a <cite>RegexLexer</cite>.</p>
|
|
<p>Any other keyword arguments passed to <cite>using()</cite> are added to the keyword
|
|
arguments used to create the lexer.</p>
|
|
</div>
|
|
<div class="section" id="delegating-lexer">
|
|
<h3>Delegating Lexer</h3>
|
|
<p>Another approach for nested lexers is the <cite>DelegatingLexer</cite> which is for
|
|
example used for the template engine lexers. It takes two lexers as
|
|
arguments on initialisation: a <cite>root_lexer</cite> and a <cite>language_lexer</cite>.</p>
|
|
<p>The input is processed as follows: First, the whole text is lexed with the
|
|
<cite>language_lexer</cite>. All tokens yielded with a type of <tt class="docutils literal">Other</tt> are then
|
|
concatenated and given to the <cite>root_lexer</cite>. The language tokens of the
|
|
<cite>language_lexer</cite> are then inserted into the <cite>root_lexer</cite>'s token stream
|
|
at the appropriate positions.</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">DelegatingLexer</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.lexers.web</span> <span class="kn">import</span> <span class="n">HtmlLexer</span><span class="p">,</span> <span class="n">PhpLexer</span>
|
|
|
|
<span class="k">class</span> <span class="nc">HtmlPhpLexer</span><span class="p">(</span><span class="n">DelegatingLexer</span><span class="p">):</span>
|
|
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">):</span>
|
|
<span class="nb">super</span><span class="p">(</span><span class="n">HtmlPhpLexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">(</span><span class="n">HtmlLexer</span><span class="p">,</span> <span class="n">PhpLexer</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span>
|
|
</pre></div>
|
|
<p>This procedure ensures that e.g. HTML with template tags in it is highlighted
|
|
correctly even if the template tags are put into HTML tags or attributes.</p>
|
|
<p>If you want to change the needle token <tt class="docutils literal">Other</tt> to something else, you can
|
|
give the lexer another token type as the third parameter:</p>
|
|
<div class="syntax"><pre><span class="n">DelegatingLexer</span><span class="o">.</span><span class="n">__init__</span><span class="p">(</span><span class="n">MyLexer</span><span class="p">,</span> <span class="n">OtherLexer</span><span class="p">,</span> <span class="n">Text</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span>
|
|
</pre></div>
|
|
</div>
|
|
<div class="section" id="callbacks">
|
|
<h3>Callbacks</h3>
|
|
<p>Sometimes the grammar of a language is so complex that a lexer would be unable
|
|
to parse it just by using regular expressions and stacks.</p>
|
|
<p>For this, the <cite>RegexLexer</cite> allows callbacks to be given in rule tuples, instead
|
|
of token types (<cite>bygroups</cite> and <cite>using</cite> are nothing else but preimplemented
|
|
callbacks). The callback must be a function taking two arguments:</p>
|
|
<ul class="simple">
|
|
<li>the lexer itself</li>
|
|
<li>the match object for the last matched rule</li>
|
|
</ul>
|
|
<p>The callback must then return an iterable of (or simply yield) <tt class="docutils literal">(index,
|
|
tokentype, value)</tt> tuples, which are then just passed through by
|
|
<cite>get_tokens_unprocessed()</cite>. The <tt class="docutils literal">index</tt> here is the position of the token in
|
|
the input string, <tt class="docutils literal">tokentype</tt> is the normal token type (like <cite>Name.Builtin</cite>),
|
|
and <tt class="docutils literal">value</tt> the associated part of the input string.</p>
|
|
<p>You can see an example here:</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="n">Generic</span>
|
|
|
|
<span class="k">class</span> <span class="nc">HypotheticLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span>
|
|
|
|
<span class="k">def</span> <span class="nf">headline_callback</span><span class="p">(</span><span class="n">lexer</span><span class="p">,</span> <span class="n">match</span><span class="p">):</span>
|
|
<span class="n">equal_signs</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
|
|
<span class="n">text</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
|
|
<span class="k">yield</span> <span class="n">match</span><span class="o">.</span><span class="n">start</span><span class="p">(),</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Headline</span><span class="p">,</span> <span class="n">equal_signs</span> <span class="o">+</span> <span class="n">text</span> <span class="o">+</span> <span class="n">equal_signs</span>
|
|
|
|
<span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s">'root'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'(=+)(.*?)(\1)'</span><span class="p">,</span> <span class="n">headline_callback</span><span class="p">)</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
</pre></div>
|
|
<p>If the regex for the <cite>headline_callback</cite> matches, the function is called with the
|
|
match object. Note that after the callback is done, processing continues
|
|
normally, that is, after the end of the previous match. The callback has no
|
|
possibility to influence the position.</p>
|
|
<p>There are not really any simple examples for lexer callbacks, but you can see
|
|
them in action e.g. in the <a class="reference external" href="http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/compiled.py">compiled.py</a> source code in the <cite>CLexer</cite> and
|
|
<cite>JavaLexer</cite> classes.</p>
|
|
</div>
|
|
<div class="section" id="the-extendedregexlexer-class">
|
|
<h3>The ExtendedRegexLexer class</h3>
|
|
<p>The <cite>RegexLexer</cite>, even with callbacks, unfortunately isn't powerful enough for
|
|
the funky syntax rules of some languages that will go unnamed, such as Ruby.</p>
|
|
<p>But fear not; even then you don't have to abandon the regular expression
|
|
approach. For Pygments has a subclass of <cite>RegexLexer</cite>, the <cite>ExtendedRegexLexer</cite>.
|
|
All features known from RegexLexers are available here too, and the tokens are
|
|
specified in exactly the same way, <em>except</em> for one detail:</p>
|
|
<p>The <cite>get_tokens_unprocessed()</cite> method holds its internal state data not as local
|
|
variables, but in an instance of the <cite>pygments.lexer.LexerContext</cite> class, and
|
|
that instance is passed to callbacks as a third argument. This means that you
|
|
can modify the lexer state in callbacks.</p>
|
|
<p>The <cite>LexerContext</cite> class has the following members:</p>
|
|
<ul class="simple">
|
|
<li><cite>text</cite> -- the input text</li>
|
|
<li><cite>pos</cite> -- the current starting position that is used for matching regexes</li>
|
|
<li><cite>stack</cite> -- a list containing the state stack</li>
|
|
<li><cite>end</cite> -- the maximum position to which regexes are matched, this defaults to
|
|
the length of <cite>text</cite></li>
|
|
</ul>
|
|
<p>Additionally, the <cite>get_tokens_unprocessed()</cite> method can be given a
|
|
<cite>LexerContext</cite> instead of a string and will then process this context instead of
|
|
creating a new one for the string argument.</p>
|
|
<p>Note that because you can set the current position to anything in the callback,
|
|
it won't automatically be set by the caller after the callback is finished.
|
|
For example, this is how the hypothetical lexer above would be written with the
|
|
<cite>ExtendedRegexLexer</cite>:</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">ExtendedRegexLexer</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="n">Generic</span>
|
|
|
|
<span class="k">class</span> <span class="nc">ExHypotheticLexer</span><span class="p">(</span><span class="n">ExtendedRegexLexer</span><span class="p">):</span>
|
|
|
|
<span class="k">def</span> <span class="nf">headline_callback</span><span class="p">(</span><span class="n">lexer</span><span class="p">,</span> <span class="n">match</span><span class="p">,</span> <span class="n">ctx</span><span class="p">):</span>
|
|
<span class="n">equal_signs</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
|
|
<span class="n">text</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
|
|
<span class="k">yield</span> <span class="n">match</span><span class="o">.</span><span class="n">start</span><span class="p">(),</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Headline</span><span class="p">,</span> <span class="n">equal_signs</span> <span class="o">+</span> <span class="n">text</span> <span class="o">+</span> <span class="n">equal_signs</span>
|
|
<span class="n">ctx</span><span class="o">.</span><span class="n">pos</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">end</span><span class="p">()</span>
|
|
|
|
<span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s">'root'</span><span class="p">:</span> <span class="p">[</span>
|
|
<span class="p">(</span><span class="s">r'(=+)(.*?)(\1)'</span><span class="p">,</span> <span class="n">headline_callback</span><span class="p">)</span>
|
|
<span class="p">]</span>
|
|
<span class="p">}</span>
|
|
</pre></div>
|
|
<p>This might sound confusing (and it really can be). But it is needed, and for an
|
|
example look at the Ruby lexer in <a class="reference external" href="https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/agile.py">agile.py</a>.</p>
|
|
</div>
|
|
<div class="section" id="filtering-token-streams">
|
|
<h3>Filtering Token Streams</h3>
|
|
<p>Some languages ship a lot of builtin functions (for example PHP). The total
|
|
amount of those functions differs from system to system because not everybody
|
|
has every extension installed. In the case of PHP there are over 3000 builtin
|
|
functions. That's an incredibly large number of functions, many more than you
|
|
can put into a regular expression.</p>
|
|
<p>But because only <cite>Name</cite> tokens can be function names, it's solvable by overriding
|
|
the <tt class="docutils literal">get_tokens_unprocessed()</tt> method. The following lexer subclasses the
|
|
<cite>PythonLexer</cite> so that it highlights some additional names as pseudo keywords:</p>
|
|
<div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexers.agile</span> <span class="kn">import</span> <span class="n">PythonLexer</span>
|
|
<span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="n">Name</span><span class="p">,</span> <span class="n">Keyword</span>
|
|
|
|
<span class="k">class</span> <span class="nc">MyPythonLexer</span><span class="p">(</span><span class="n">PythonLexer</span><span class="p">):</span>
|
|
<span class="n">EXTRA_KEYWORDS</span> <span class="o">=</span> <span class="p">[</span><span class="s">'foo'</span><span class="p">,</span> <span class="s">'bar'</span><span class="p">,</span> <span class="s">'foobar'</span><span class="p">,</span> <span class="s">'barfoo'</span><span class="p">,</span> <span class="s">'spam'</span><span class="p">,</span> <span class="s">'eggs'</span><span class="p">]</span>
|
|
|
|
<span class="k">def</span> <span class="nf">get_tokens_unprocessed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">):</span>
|
|
<span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">token</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">PythonLexer</span><span class="o">.</span><span class="n">get_tokens_unprocessed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">):</span>
|
|
<span class="k">if</span> <span class="n">token</span> <span class="ow">is</span> <span class="n">Name</span> <span class="ow">and</span> <span class="n">value</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">EXTRA_KEYWORDS</span><span class="p">:</span>
|
|
<span class="k">yield</span> <span class="n">index</span><span class="p">,</span> <span class="n">Keyword</span><span class="o">.</span><span class="n">Pseudo</span><span class="p">,</span> <span class="n">value</span>
|
|
<span class="k">else</span><span class="p">:</span>
|
|
<span class="k">yield</span> <span class="n">index</span><span class="p">,</span> <span class="n">token</span><span class="p">,</span> <span class="n">value</span>
|
|
</pre></div>
|
|
<p>The <cite>PhpLexer</cite> and <cite>LuaLexer</cite> use this method to resolve builtin functions.</p>
|
|
<p><strong>Note</strong> Do not confuse this with the <a class="reference external" href="./filters.html">filter</a> system.</p>
|
|
</div>
|
|
|
|
</div>
|
|
</body>
|
|
<!-- generated on: 2013-01-09 17:48:42.356609
|
|
file id: lexerdevelopment -->
|
|
</html> |