[Mlir-commits] [mlir] [MLIR][Pygments] Refine the pygments MLIR lexer (PR #166406)

Tue Nov 4 09:19:52 PST 2025

https://github.com/PragmaTwice created https://github.com/llvm/llvm-project/pull/166406

Recently, the MLIR website added Sphinx-generated API documentation for the Python bindings ([https://mlir.llvm.org/python-bindings/](https://mlir.llvm.org/python-bindings/)). In [https://github.com/llvm/mlir-www/pull/245](https://github.com/llvm/mlir-www/pull/245), I introduced the Pygments lexer from the MLIR repository to enable syntax highlighting for MLIR code blocks in these API docs.

However, since the existing Pygments lexer was fairly minimal, it didn’t handle all aspects of the MLIR syntax, leading to imperfect highlighting in some cases. In this PR, I used ChatGPT to rewrite the lexer by combining it with the TextMate grammar for MLIR ([https://github.com/llvm/llvm-project/blob/main/mlir/utils/textmate/mlir.json](https://github.com/llvm/llvm-project/blob/main/mlir/utils/textmate/mlir.json)). After some manual adjustments, the results look quite good—so I’m submitting this to improve the syntax highlighting experience in the Python bindings API documentation.


>From 94821d0f1afc07d853abcdb211729661924b04b9 Mon Sep 17 00:00:00 2001
From: PragmaTwice <twice at apache.org>
Date: Wed, 5 Nov 2025 01:08:09 +0800
Subject: [PATCH] [MLIR][Pygments] Refine the pygments MLIR lexer

---
 mlir/utils/pygments/mlir_lexer.py | 136 +++++++++++++++++++++++++-----
 1 file changed, 114 insertions(+), 22 deletions(-)

diff --git a/mlir/utils/pygments/mlir_lexer.py b/mlir/utils/pygments/mlir_lexer.py
index 179a058e9110c..ebe29e083387c 100644
--- a/mlir/utils/pygments/mlir_lexer.py
+++ b/mlir/utils/pygments/mlir_lexer.py
@@ -2,37 +2,129 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from pygments.lexer import RegexLexer
+from pygments.lexer import RegexLexer, bygroups, include, combined
 from pygments.token import *
+import re
 
 
 class MlirLexer(RegexLexer):
+    """Pygments lexer for MLIR.
+
+    This lexer focuses on accurate tokenization of common MLIR constructs:
+    - SSA values (%%... / %...)
+    - attribute and type aliases (#name =, !name =)
+    - types (builtin and dialect types, parametric types)
+    - attribute dictionaries and nested containers to a reasonable depth
+    - numbers (ints, floats with exponents, hex)
+    - strings with common escapes
+    - line comments (// ...)
+    - block labels (^foo) and operations
+    """
+
     name = "MLIR"
     aliases = ["mlir"]
     filenames = ["*.mlir"]
 
+    flags = re.MULTILINE
+
     tokens = {
         "root": [
-            (r"%[a-zA-Z0-9_]+", Name.Variable),
-            (r"@[a-zA-Z_][a-zA-Z0-9_]+", Name.Function),
-            (r"\^[a-zA-Z0-9_]+", Name.Label),
-            (r"#[a-zA-Z0-9_]+", Name.Constant),
-            (r"![a-zA-Z0-9_]+", Keyword.Type),
-            (r"[a-zA-Z_][a-zA-Z0-9_]*\.", Name.Entity),
-            (r"memref[^.]", Keyword.Type),
-            (r"index", Keyword.Type),
-            (r"i[0-9]+", Keyword.Type),
-            (r"f[0-9]+", Keyword.Type),
+            # Comments
+            (r"//.*?$", Comment.Single),
+
+            # Attribute alias definition:  #name =
+            (r"^\s*(#[_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
+             bygroups(Name.Constant, Text, Operator)),
+
+            # Type alias definition: !name =
+            (r"^\s*(![_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
+             bygroups(Keyword.Type, Text, Operator)),
+
+            # SSA values (results, uses) - allow many characters MLIR uses
+            (r"%[%_A-Za-z0-9\.\$:\-]+", Name.Variable),
+
+            # attribute refs, constants and named attributes
+            (r"#[_A-Za-z0-9\$\-\.]+\b", Name.Constant),
+
+            # symbol refs / function-like names
+            (r"@[_A-Za-z][_A-Za-z0-9\$\-\.]*\b", Name.Function),
+
+            # blocks
+            (r"\^[A-Za-z0-9_\$\.\-]+", Name.Label),
+
+            # types by exclamation or builtin names
+            (r"![_A-Za-z0-9\$\-\.]+\b", Keyword.Type),
+            (r"\b(bf16|f16|f32|f64|f80|f128|index|none|(u|s)?i[0-9]+)\b", Keyword.Type),
+
+            # container-like dialect types (tensor<...>, memref<...>, vector<...>)
+            (r"\b(complex|memref|tensor|tuple|vector)\s*(<)", bygroups(Keyword.Type, Punctuation), 'angled-type'),
+
+            # affine constructs
+            (r"\b(affine_map|affine_set)\b", Keyword.Reserved),
+
+            # common builtin operators / functions inside affine_map
+            (r"\b(ceildiv|floordiv|mod|symbol)\b", Name.Builtin),
+
+            # operation definitions with assignment: %... = op.name
+            (r"^\s*(%[\%_A-Za-z0-9\:\,\s]+)\s*(=)\s*([A-Za-z0-9_\.\$\-]+)\b",
+             bygroups(Name.Variable, Operator, Name.Function)),
+
+            # operation name without result
+            (r"^\s*([A-Za-z0-9_\.\$\-]+)\b(?=[^<:])", Name.Function),
+
+            # identifiers / bare words
+            (r"\b[_A-Za-z][_A-Za-z0-9\.-]*\b", Name.Other),
+
+            # numbers: hex, float (with exponent), integer
+            (r"\b0x[0-9A-Fa-f]+\b", Number.Hex),
+            (r"\b([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][+-]?[0-9]+)?\b", Number.Float),
+            (r"\b[0-9]+\b", Number.Integer),
+
+            # strings
+            (r'"', String.Double, 'string'),
+
+            # punctuation and arrow-like tokens
+            (r"->|>=|<=|\>=|\<=|\->|\=>", Operator),
+            (r"[()\[\]{}<>,.:=]", Punctuation),
+
+            # operators
+            (r"[-+*/%]", Operator),
+        ],
+
+        # string state with common escapes
+        'string': [
+            (r'\\[ntr"\\]', String.Escape),
+            (r'[^"\\]+', String.Double),
+            (r'"', String.Double, '#pop'),
+        ],
+
+        # angled-type content (simple nested handling)
+        'angled-type': [
+            # match nested '<' and '>'
+            (r"<", Punctuation, '#push'),
+            (r">", Punctuation, '#pop'),
+
+            # dimensions like 3x or 3x3x... and standalone numbers:
+            # - match numbers that are followed by an 'x' (dimension separator)
+            (r"([0-9]+)(?=(?:[xX]))", Number.Integer),
+            # - match bare numbers (sizes)
             (r"[0-9]+", Number.Integer),
-            (r"[0-9]*\.[0-9]*", Number.Float),
-            (r'"[^"]*"', String.Double),
-            (r"affine_map", Keyword.Reserved),
-            # TODO: this should be within affine maps only
-            (r"\+-\*\/", Operator),
-            (r"floordiv", Operator.Word),
-            (r"ceildiv", Operator.Word),
-            (r"mod", Operator.Word),
-            (r"()\[\]<>,{}", Punctuation),
-            (r"\/\/.*\n", Comment.Single),
-        ]
+            # dynamic dimension '?'
+            (r"\?", Name.Constant),
+
+            # the 'x' dimension separator (treat as punctuation)
+            (r"[xX]", Punctuation),
+
+            # element / builtin types inside angle brackets (no word-boundary)
+            (r"(?:bf16|f16|f32|f64|f80|f128|index|none|(?:[us]?i[0-9]+))",
+            Keyword.Type),
+
+            # also allow nested container-like types to be recognized
+            (r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
+            bygroups(Keyword.Type, Punctuation), 'angled-type'),
+
+            # fall back to root rules for anything else
+            include('root'),
+        ],
+
     }