[Mlir-commits] [mlir] [MLIR][Pygments] Refine the pygments MLIR lexer (PR #166406)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Fri Nov 7 02:04:47 PST 2025
================
@@ -2,37 +2,119 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from pygments.lexer import RegexLexer
+from pygments.lexer import RegexLexer, bygroups, include, combined
from pygments.token import *
+import re
class MlirLexer(RegexLexer):
+ """Pygments lexer for MLIR.
+
+ This lexer focuses on accurate tokenization of common MLIR constructs:
+ - SSA values (%%... / %...)
+ - attribute and type aliases (#name =, !name =)
+ - types (builtin and dialect types, parametric types)
+ - attribute dictionaries and nested containers to a reasonable depth
+ - numbers (ints, floats with exponents, hex)
+ - strings with common escapes
+ - line comments (// ...)
+ - block labels (^foo) and operations
+ """
+
name = "MLIR"
aliases = ["mlir"]
filenames = ["*.mlir"]
+ flags = re.MULTILINE
+
tokens = {
"root": [
- (r"%[a-zA-Z0-9_]+", Name.Variable),
- (r"@[a-zA-Z_][a-zA-Z0-9_]+", Name.Function),
- (r"\^[a-zA-Z0-9_]+", Name.Label),
- (r"#[a-zA-Z0-9_]+", Name.Constant),
- (r"![a-zA-Z0-9_]+", Keyword.Type),
- (r"[a-zA-Z_][a-zA-Z0-9_]*\.", Name.Entity),
- (r"memref[^.]", Keyword.Type),
- (r"index", Keyword.Type),
- (r"i[0-9]+", Keyword.Type),
- (r"f[0-9]+", Keyword.Type),
+ # Comments
+ (r"//.*?$", Comment.Single),
+ # Attribute alias definition: #name =
+ (
+ r"^\s*(#[_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
+ bygroups(Name.Constant, Text, Operator),
+ ),
+ # Type alias definition: !name =
+ (
+ r"^\s*(![_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
+ bygroups(Keyword.Type, Text, Operator),
+ ),
+ # SSA values (results, uses) - allow many characters MLIR uses
+ (r"%[%_A-Za-z0-9\.\$:\-]+", Name.Variable),
+ # attribute refs, constants and named attributes
+ (r"#[_A-Za-z0-9\$\-\.]+\b", Name.Constant),
+ # symbol refs / function-like names
+ (r"@[_A-Za-z][_A-Za-z0-9\$\-\.]*\b", Name.Function),
+ # blocks
+ (r"\^[A-Za-z0-9_\$\.\-]+", Name.Label),
+ # types by exclamation or builtin names
+ (r"![_A-Za-z0-9\$\-\.]+\b", Keyword.Type),
+ (r"\b(bf16|f16|f32|f64|f80|f128|index|none|(u|s)?i[0-9]+)\b", Keyword.Type),
+ # container-like dialect types (tensor<...>, memref<...>, vector<...>)
+ (
+ r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
+ bygroups(Keyword.Type, Punctuation),
+ "angled-type",
+ ),
+ # affine constructs
+ (r"\b(affine_map|affine_set)\b", Keyword.Reserved),
+ # common builtin operators / functions inside affine_map
+ (r"\b(ceildiv|floordiv|mod|symbol)\b", Name.Builtin),
+ # operation definitions with assignment: %... = op.name
+ (
+ r"^(\s*)(%[\%_A-Za-z0-9\:\,\s]+)(\s*=\s*)([A-Za-z0-9_\.\$\-]+)\b",
+ bygroups(Text, Name.Variable, Operator, Name.Function),
+ ),
+ # operation name without result
+ (r"^(\s*)([A-Za-z0-9_\.\$\-]+)\b(?=[^<:])", bygroups(Text, Name.Function)),
+ # identifiers / bare words
+ (r"\b[_A-Za-z][_A-Za-z0-9\.-]*\b", Name.Other),
+ # numbers: hex, float (with exponent), integer
+ (r"\b0x[0-9A-Fa-f]+\b", Number.Hex),
+ (r"\b([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][+-]?[0-9]+)?\b", Number.Float),
+ (r"\b[0-9]+\b", Number.Integer),
+ # strings
+ (r'"', String.Double, "string"),
+ # punctuation and arrow-like tokens
+ (r"->|>=|<=|\>=|\<=|\->|\=>", Operator),
+ (r"[()\[\]{}<>,.:=]", Punctuation),
+ # operators
+ (r"[-+*/%]", Operator),
+ ],
+ # string state with common escapes
+ "string": [
+ (r'\\[ntr"\\]', String.Escape),
+ (r'[^"\\]+', String.Double),
+ (r'"', String.Double, "#pop"),
+ ],
+ # angled-type content (simple nested handling)
+ "angled-type": [
+ # match nested '<' and '>'
+ (r"<", Punctuation, "#push"),
+ (r">", Punctuation, "#pop"),
+ # dimensions like 3x or 3x3x... and standalone numbers:
+ # - match numbers that are followed by an 'x' (dimension separator)
+ (r"([0-9]+)(?=(?:[xX]))", Number.Integer),
+ # - match bare numbers (sizes)
(r"[0-9]+", Number.Integer),
- (r"[0-9]*\.[0-9]*", Number.Float),
- (r'"[^"]*"', String.Double),
- (r"affine_map", Keyword.Reserved),
- # TODO: this should be within affine maps only
- (r"\+-\*\/", Operator),
- (r"floordiv", Operator.Word),
- (r"ceildiv", Operator.Word),
- (r"mod", Operator.Word),
- (r"()\[\]<>,{}", Punctuation),
- (r"\/\/.*\n", Comment.Single),
- ]
+ # dynamic dimension '?'
+ (r"\?", Name.Constant),
+ # the 'x' dimension separator (treat as punctuation)
+ (r"[xX]", Punctuation),
+ # element / builtin types inside angle brackets (no word-boundary)
+ (
+ r"(?:bf16|f16|f32|f64|f80|f128|index|none|(?:[us]?i[0-9]+))",
----------------
PragmaTwice wrote:
Ah, that looks good. I'll add it.
https://github.com/llvm/llvm-project/pull/166406
More information about the Mlir-commits
mailing list