[llvm] [MC] Add canonicalizeMnemonic hook (PR #174160)

Aiden Grossman via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 1 13:07:14 PST 2026


https://github.com/boomanaiden154 created https://github.com/llvm/llvm-project/pull/174160

Fixes #87917.

Some ISAs, such as X86, have instruction prefixes, which means that lowercasing the opcode in parseAndMatchAndEmitTargetInstruction() canonicalizes only the prefix, not the actual mnemonic. We cannot simply convert the case within parseInstruction() due to lifetime constraints around StringRef. We also cannot canonicalize mnemonics generically in AsmParser, because different backends implement their own token storage and details like HasMnemonicFirst are not exposed. So implement the (seemingly) cleanest approach: a new target-specific hook. Also wire it up in X86 to fix parsing of capitalized instruction mnemonics that come after prefixes.

>From f76446641366158695bcc7e83b15629dc0c2c919 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Thu, 1 Jan 2026 21:01:35 +0000
Subject: [PATCH] [MC] Add canonicalizeMnemonic hook

Fixes #87917.

Some ISAs, such as X86, have instruction prefixes, which means that
lowercasing the opcode in parseAndMatchAndEmitTargetInstruction()
canonicalizes only the prefix, not the actual mnemonic. We cannot simply
convert the case within parseInstruction() due to lifetime constraints
around StringRef. We also cannot canonicalize mnemonics generically in
AsmParser, because different backends implement their own token storage
and details like HasMnemonicFirst are not exposed. So implement the
(seemingly) cleanest approach: a new target-specific hook. Also wire it
up in X86 to fix parsing of capitalized instruction mnemonics that come
after prefixes.
---
 .../include/llvm/MC/MCParser/MCTargetAsmParser.h |  9 +++++++++
 llvm/lib/MC/MCParser/AsmParser.cpp               |  3 +++
 llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp   | 16 ++++++++++++----
 llvm/test/MC/X86/x86-64.s                        |  4 ++++
 4 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
index 7628987b9587d..23446f03cf277 100644
--- a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
+++ b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
@@ -471,6 +471,15 @@ class LLVM_ABI MCTargetAsmParser : public MCAsmParserExtension {
                                        uint64_t &ErrorInfo,
                                        bool MatchingInlineAsm) = 0;
 
+  /// Canonicalize the instruction mnemonic after parseInstruction() has run.
+  ///
+  /// parseInstruction() is not allowed to update the string contents of any
+  /// token, which some targets need in order to canonicalize the mnemonic.
+  /// This hook lets a target replace the mnemonic in \p Operands. Due to
+  /// lifetime constraints, the new string must be stored in \p NameStorage.
+  virtual void canonicalizeMnemonic(OperandVector &Operands,
+                                    std::string &NameStorage) {}
+
   /// Allows targets to let registers opt out of clobber lists.
   virtual bool omitRegisterFromClobberLists(MCRegister Reg) { return false; }
 
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 429cdae1fa1b6..e1517289ddcef 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -2238,6 +2238,9 @@ bool AsmParser::parseAndMatchAndEmitTargetInstruction(ParseStatementInfo &Info,
                                                           Info.ParsedOperands);
   Info.ParseError = ParseHadError;
 
+  std::string MnemonicStr;
+  getTargetParser().canonicalizeMnemonic(Info.ParsedOperands, MnemonicStr);
+
   // Dump the parsed representation, if requested.
   if (getShowParsedOperands()) {
     SmallString<256> Str;
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d1dda4debe229..5204c8c50e14b 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1228,6 +1228,9 @@ class X86AsmParser : public MCTargetAsmParser {
                                uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
 
+  void canonicalizeMnemonic(OperandVector &Operands,
+                            std::string &NameStorage) override;
+
   void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
                          MCStreamer &Out, bool MatchingInlineAsm);
 
@@ -3312,7 +3315,6 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
       } else {
         if (getLexer().isNot(AsmToken::Identifier))
           return Error(Parser.getTok().getLoc(), "Expected identifier");
-        // FIXME: The mnemonic won't match correctly if its not in lower case.
         Name = Parser.getTok().getString();
         Parser.Lex();
       }
@@ -3332,7 +3334,6 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
       if (ForcedOpcodePrefix != OpcodePrefix_Default) {
         if (getLexer().isNot(AsmToken::Identifier))
           return Error(Parser.getTok().getLoc(), "Expected identifier");
-        // FIXME: The mnemonic won't match correctly if its not in lower case.
         Name = Parser.getTok().getString();
         NameLoc = Parser.getTok().getLoc();
         Parser.Lex();
@@ -3560,7 +3561,6 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
       Flags = X86::IP_NO_PREFIX;
       break;
     }
-    // FIXME: The mnemonic won't match correctly if its not in lower case.
     Name = Parser.getTok().getString();
     Parser.Lex(); // eat the prefix
     // Hack: we could have something like "rep # some comment" or
@@ -3568,7 +3568,6 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
     while (Name.starts_with(";") || Name.starts_with("\n") ||
            Name.starts_with("#") || Name.starts_with("\t") ||
            Name.starts_with("/")) {
-      // FIXME: The mnemonic won't match correctly if its not in lower case.
       Name = Parser.getTok().getString();
       Parser.Lex(); // go to next prefix or instr
     }
@@ -4250,6 +4249,15 @@ bool X86AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                           ErrorInfo, MatchingInlineAsm);
 }
 
+void X86AsmParser::canonicalizeMnemonic(OperandVector &Operands,
+                                        std::string &NameStorage) {
+  if (Operands.empty()) // No mnemonic token to canonicalize.
+    return;
+  NameStorage = static_cast<X86Operand &>(*Operands[0]).getToken().lower();
+  Operands[0] =
+      X86Operand::CreateToken(NameStorage, Operands[0]->getStartLoc());
+}
+
 void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
                                      OperandVector &Operands, MCStreamer &Out,
                                      bool MatchingInlineAsm) {
diff --git a/llvm/test/MC/X86/x86-64.s b/llvm/test/MC/X86/x86-64.s
index 2da72d52d997c..4774168814fff 100644
--- a/llvm/test/MC/X86/x86-64.s
+++ b/llvm/test/MC/X86/x86-64.s
@@ -1984,3 +1984,7 @@ senduipi %r8
 // CHECK: senduipi %r13
 // CHECK: encoding: [0xf3,0x41,0x0f,0xc7,0xf5]
 senduipi %r13
+
+// CHECK: rep movl $29, %eax
+// CHECK: encoding: [0xf3,0xb8,0x1d,0x00,0x00,0x00]
+REP MOVL $0x1d, %EAX



More information about the llvm-commits mailing list