[llvm] [LLVM][MC][DecoderEmitter] Add support to specialize decoder per bitwidth (PR #154865)

Wed Aug 27 13:37:09 PDT 2025

https://github.com/jurahul updated https://github.com/llvm/llvm-project/pull/154865

>From bae0b4ff45916e29663add95fb219f85684cc7eb Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Thu, 21 Aug 2025 16:08:03 -0700
Subject: [PATCH 1/8] Cull decoders

---
 llvm/include/llvm/MC/MCDecoder.h              |  25 ++++
 llvm/include/llvm/Target/Target.td            |   7 +
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   1 +
 .../Disassembler/AMDGPUDisassembler.cpp       |  29 ++---
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h  |  38 ------
 .../RISCV/Disassembler/RISCVDisassembler.cpp  |  19 ++-
 llvm/lib/Target/RISCV/RISCV.td                |   1 +
 llvm/test/TableGen/VarLenDecoder.td           |  11 +-
 llvm/utils/TableGen/DecoderEmitter.cpp        | 121 +++++++++++++-----
 9 files changed, 153 insertions(+), 99 deletions(-)

diff --git a/llvm/include/llvm/MC/MCDecoder.h b/llvm/include/llvm/MC/MCDecoder.h
index 70762a4a5ebae..6259ef5a3bd5d 100644
--- a/llvm/include/llvm/MC/MCDecoder.h
+++ b/llvm/include/llvm/MC/MCDecoder.h
@@ -12,6 +12,7 @@
 
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/Support/MathExtras.h"
+#include <bitset>
 #include <cassert>
 
 namespace llvm::MCD {
@@ -48,6 +49,15 @@ fieldFromInstruction(const InsnType &Insn, unsigned StartBit,
   return Insn.extractBitsAsZExtValue(NumBits, StartBit);
 }
 
+template <size_t N>
+uint64_t fieldFromInstruction(const std::bitset<N> &Insn, unsigned StartBit,
+                              unsigned NumBits) {
+  assert(StartBit + NumBits <= N && "Instruction field out of bounds!");
+  assert(NumBits <= 64 && "Cannot support >64-bit extractions!");
+  const std::bitset<N> Mask(maskTrailingOnes<uint64_t>(NumBits));
+  return ((Insn >> StartBit) & Mask).to_ullong();
+}
+
 // Helper function for inserting bits extracted from an encoded instruction into
 // an integer-typed field.
 template <typename IntType>
@@ -62,6 +72,21 @@ insertBits(IntType &field, IntType bits, unsigned startBit, unsigned numBits) {
   field |= bits << startBit;
 }
 
+// InsnBitWidth is essentially a type trait used by the decoder emitter to query
+// the supported bitwidth for a given type. But default, the value is 0, making
+// it an invalid type for use as `InsnType` when instantiating the decoder.
+template <typename T> inline constexpr uint32_t InsnBitWidth = 0;
+
+// Provide specializations for commonly used types.
+// Integer types.
+template <> inline constexpr uint32_t InsnBitWidth<uint8_t> = 8;
+template <> inline constexpr uint32_t InsnBitWidth<uint16_t> = 16;
+template <> inline constexpr uint32_t InsnBitWidth<uint32_t> = 32;
+template <> inline constexpr uint32_t InsnBitWidth<uint64_t> = 64;
+
+// std::bitset<N>.
+template <size_t N> inline constexpr uint32_t InsnBitWidth<std::bitset<N>> = N;
+
 } // namespace llvm::MCD
 
 #endif // LLVM_MC_MCDECODER_H
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 495b59ee916cf..403c866e386ae 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1158,6 +1158,13 @@ class InstrInfo {
   //
   // This option is a temporary migration help. It will go away.
   bit guessInstructionProperties = true;
+
+  // Generate decoders that are specialized per bit width in the generated
+  // decoder/disassembler. This requires use of different `InsnType` for
+  // different bitwidths and defining `InsnBitWidth` template specialization for
+  // the `InsnType` types used. Some common specializations are already defined
+  // in MCDecoder.h.
+  bit SpecializeDecodersPerBitwidth = false;
 }
 
 // Standard Pseudo Instructions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index edd3ce72d7df3..4d9185d40b9a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2100,6 +2100,7 @@ def FeatureISAVersion12_Generic: FeatureSet<
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
+  let SpecializeDecodersPerBitwidth = true;
 }
 
 def AMDGPUAsmParser : AsmParser {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 6a2beeed41dfd..ac42e51d447e3 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/Compiler.h"
 
 using namespace llvm;
+using namespace llvm::MCD;
 
 #define DEBUG_TYPE "amdgpu-disassembler"
 
@@ -498,26 +499,24 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
   return Res;
 }
 
-static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<96> eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 12);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Hi(read<uint32_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(4);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
-static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<128> eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 16);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Hi(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
 void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
@@ -600,14 +599,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
     // encodings
     if (isGFX1250() && Bytes.size() >= 16) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
         break;
       Bytes = Bytes_.slice(0, MaxInstBytesNum);
     }
 
     if (isGFX11Plus() && Bytes.size() >= 12) {
-      DecoderUInt128 DecW = eat12Bytes(Bytes);
+      std::bitset<96> DecW = eat12Bytes(Bytes);
 
       if (isGFX11() &&
           tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
@@ -642,7 +641,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
     } else if (Bytes.size() >= 16 &&
                STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
         break;
 
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index f4d164bf10c3c..ded447b6f8d5a 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -32,44 +32,6 @@ class MCOperand;
 class MCSubtargetInfo;
 class Twine;
 
-// Exposes an interface expected by autogenerated code in
-// FixedLenDecoderEmitter
-class DecoderUInt128 {
-private:
-  uint64_t Lo = 0;
-  uint64_t Hi = 0;
-
-public:
-  DecoderUInt128() = default;
-  DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
-  operator bool() const { return Lo || Hi; }
-  uint64_t extractBitsAsZExtValue(unsigned NumBits,
-                                  unsigned BitPosition) const {
-    assert(NumBits && NumBits <= 64);
-    assert(BitPosition < 128);
-    uint64_t Val;
-    if (BitPosition < 64)
-      Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
-    else
-      Val = Hi >> (BitPosition - 64);
-    return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
-  }
-  DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
-    return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
-  }
-  DecoderUInt128 operator&(const uint64_t &RHS) const {
-    return *this & DecoderUInt128(RHS);
-  }
-  DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
-  bool operator==(const DecoderUInt128 &RHS) {
-    return Lo == RHS.Lo && Hi == RHS.Hi;
-  }
-  bool operator!=(const DecoderUInt128 &RHS) {
-    return Lo != RHS.Lo || Hi != RHS.Hi;
-  }
-  bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); }
-};
-
 //===----------------------------------------------------------------------===//
 // AMDGPUDisassembler
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index dbb16fce8390a..494de07843237 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -558,7 +558,8 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
   return decodeZcmpRlist(Inst, Imm, Address, Decoder);
 }
 
-static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst,
+                                        const std::bitset<48> &Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5);
@@ -568,7 +569,8 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
+static DecodeStatus decodeXTHeadMemPair(MCInst &Inst,
+                                        const std::bitset<48> &Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
@@ -710,9 +712,7 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
   }
   Size = 4;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read32le(Bytes.data());
+  uint32_t Insn = support::endian::read32le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList32) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -758,9 +758,7 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
   }
   Size = 2;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read16le(Bytes.data());
+  uint16_t Insn = support::endian::read16le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList16) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -794,10 +792,11 @@ DecodeStatus RISCVDisassembler::getInstruction48(MCInst &MI, uint64_t &Size,
   }
   Size = 6;
 
-  uint64_t Insn = 0;
+  uint64_t InsnBits = 0;
   for (size_t i = Size; i-- != 0;)
-    Insn += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
+    InsnBits += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
 
+  std::bitset<48> Insn = InsnBits;
   for (const DecoderListEntry &Entry : DecoderList48) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
       continue;
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index b24d8637cb27f..bd8bc56f1da3a 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -85,6 +85,7 @@ include "RISCVPfmCounters.td"
 
 def RISCVInstrInfo : InstrInfo {
   let guessInstructionProperties = 0;
+  let SpecializeDecodersPerBitwidth = true;
 }
 
 def RISCVAsmParser : AsmParser {
diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/VarLenDecoder.td
index 769c5895ec3c1..10e254f7673e6 100644
--- a/llvm/test/TableGen/VarLenDecoder.td
+++ b/llvm/test/TableGen/VarLenDecoder.td
@@ -47,6 +47,12 @@ def FOO32 : MyVarInst<MemOp32> {
   );
 }
 
+// Instruction length table
+// CHECK: InstrLenTable
+// CHECK: 27,
+// CHECK-NEXT: 43,
+// CHECK-NEXT: };
+
 // CHECK-SMALL:      /* 0 */       MCD::OPC_ExtractField, 3, 5,  // Inst{7-3} ...
 // CHECK-SMALL-NEXT: /* 3 */       MCD::OPC_FilterValue, 8, 4, 0, // Skip to: 11
 // CHECK-SMALL-NEXT: /* 7 */       MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16
@@ -61,11 +67,6 @@ def FOO32 : MyVarInst<MemOp32> {
 // CHECK-LARGE-NEXT: /* 14 */      MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32
 // CHECK-LARGE-NEXT: };
 
-// Instruction length table
-// CHECK: 27,
-// CHECK-NEXT: 43,
-// CHECK-NEXT: };
-
 // CHECK:      case 0:
 // CHECK-NEXT: tmp = fieldFromInstruction(insn, 8, 3);
 // CHECK-NEXT: if (!Check(S, DecodeRegClassRegisterClass(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; }
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index ecdc48775c9c1..6117261b80bdd 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
@@ -354,7 +355,8 @@ class DecoderEmitter {
   void emitPredicateFunction(formatted_raw_ostream &OS,
                              PredicateSet &Predicates) const;
   void emitDecoderFunction(formatted_raw_ostream &OS,
-                           DecoderSet &Decoders) const;
+                           const DecoderSet &Decoders,
+                           unsigned BucketBitWidth) const;
 
   // run - Output the code emitter
   void run(raw_ostream &o) const;
@@ -924,7 +926,8 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS,
 }
 
 void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
-                                         DecoderSet &Decoders) const {
+                                         const DecoderSet &Decoders,
+                                         unsigned BucketBitWidth) const {
   // The decoder function is just a big switch statement or a table of function
   // pointers based on the input decoder index.
 
@@ -955,8 +958,13 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
 
   OS << "// Handling " << Decoders.size() << " cases.\n";
   OS << "template <typename InsnType>\n";
-  OS << "static DecodeStatus decodeToMCInst(unsigned Idx, " << DecodeParams
-     << ") {\n";
+  OS << "static ";
+  if (BucketBitWidth != 0)
+    OS << "std::enable_if_t<InsnBitWidth<InsnType> == " << BucketBitWidth
+       << ", DecodeStatus>\n";
+  else
+    OS << "DecodeStatus ";
+  OS << "decodeToMCInst(unsigned Idx, " << DecodeParams << ") {\n";
   OS << "  using namespace llvm::MCD;\n";
   OS << "  DecodeComplete = true;\n";
 
@@ -969,7 +977,6 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
     OS << "  };\n";
     OS << "  if (Idx >= " << Decoders.size() << ")\n";
     OS << "    llvm_unreachable(\"Invalid decoder index!\");\n";
-
     OS << "  return decodeFnTable[Idx](S, insn, MI, Address, Decoder, "
           "DecodeComplete);\n";
   } else {
@@ -2514,21 +2521,29 @@ namespace {
 )";
 
   // Do extra bookkeeping for variable-length encodings.
-  std::vector<unsigned> InstrLen;
   bool IsVarLenInst = Target.hasVariableLengthEncodings();
   unsigned MaxInstLen = 0;
   if (IsVarLenInst) {
-    InstrLen.resize(Target.getInstructions().size(), 0);
+    std::vector<unsigned> InstrLen(Target.getInstructions().size(), 0);
     for (const InstructionEncoding &Encoding : Encodings) {
       MaxInstLen = std::max(MaxInstLen, Encoding.getBitWidth());
       InstrLen[Target.getInstrIntValue(Encoding.getInstruction()->TheDef)] =
           Encoding.getBitWidth();
     }
+
+    // For variable instruction, we emit a instruction length table to let the
+    // decoder know how long the instructions are. You can see example usage in
+    // M68k's disassembler.
+    emitInstrLenTable(OS, InstrLen);
   }
 
   // Map of (namespace, hwmode, size) tuple to encoding IDs.
-  std::map<std::tuple<StringRef, unsigned, unsigned>, std::vector<unsigned>>
-      EncMap;
+  using EncMapTy = std::map<std::tuple<StringRef, unsigned, unsigned>,
+                            std::vector<unsigned>>;
+  EncMapTy EncMap;
+
+  // The set of valid instruction bitwidths for this target.
+  SmallSet<unsigned, 4> InstrBitwidths;
   for (const auto &[HwModeID, EncodingIDs] : EncodingIDsByHwMode) {
     for (unsigned EncodingID : EncodingIDs) {
       const InstructionEncoding &Encoding = Encodings[EncodingID];
@@ -2536,37 +2551,84 @@ namespace {
       unsigned Size = EncodingDef->getValueAsInt("Size");
       StringRef DecoderNamespace =
           EncodingDef->getValueAsString("DecoderNamespace");
+      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
+      InstrBitwidths.insert(BitWidth);
       EncMap[{DecoderNamespace, HwModeID, Size}].push_back(EncodingID);
     }
   }
 
+  const bool SpecializeDecodersPerBitwidth =
+      Target.getInstructionSet()->getValueAsBit(
+          "SpecializeDecodersPerBitwidth");
+
+  // Variable length instructions use the same `APInt` type for all instructions
+  // so we cannot specialize decoders based on instruction bitwidths (which
+  // requires using different `InstType` for differet bitwidths for the correct
+  // template specialization to kick in).
+  if (IsVarLenInst && SpecializeDecodersPerBitwidth)
+    PrintFatalError(
+        "Cannot specialize decoders for variable length instuctions");
+
+  // Bucket entries in the `EncMap` based on the instruction bitwidths if
+  // SpecializeDecodersPerBitwidth is enabled.
+  SmallVector<SmallVector<const EncMapTy::value_type *>> PerBitWidthEncMap;
+  if (SpecializeDecodersPerBitwidth) {
+    for (unsigned BW : InstrBitwidths) {
+      auto &Bucket = PerBitWidthEncMap.emplace_back();
+      for (const auto &Entry : EncMap) {
+        unsigned Size = std::get<2>(Entry.first);
+        const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
+        if (BitWidth != BW)
+          continue;
+        Bucket.push_back(&Entry);
+      }
+    }
+  } else {
+    // If we are not emitting decoders specialized per bit width, create a
+    // single bucket
+    auto &Bucket = PerBitWidthEncMap.emplace_back();
+    Bucket.reserve(EncMap.size());
+    for (const auto &Entry : EncMap)
+      Bucket.push_back(&Entry);
+  }
+
   DecoderTableInfo TableInfo;
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;
 
-  for (const auto &[Key, EncodingIDs] : EncMap) {
-    auto [DecoderNamespace, HwModeID, Size] = Key;
-    const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-    // Emit the decoder for this (namespace, hwmode, width) combination.
-    FilterChooser FC(Encodings, EncodingIDs);
+  for (const auto &Bucket : PerBitWidthEncMap) {
+    // Each BitWidth get's its own decoders and decoder function.
+    TableInfo.Decoders.clear();
+
+    for (const auto &[Key, EncodingIDs] : make_pointee_range(Bucket)) {
+      auto [DecoderNamespace, HwModeID, Size] = Key;
+      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
+      // Emit the decoder for this (namespace, hwmode, width) combination.
+      FilterChooser FC(Encodings, EncodingIDs, BitWidth, Target);
+
+      // The decode table is cleared for each top level decoder function. The
+      // predicates and decoders themselves, however, are shared across
+      // different decoders to give more opportunities for uniqueing.
+      //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
+      //    across all decoder tables for a given bitwidth, else they are shared
+      //    across all decoder tables.
+      //  - predicates are shared across all decoder tables.
+      TableInfo.Table.clear();
+      TableBuilder.buildTable(FC);
+
+      // Print the table to the output stream.
+      OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
+                              BitWidth, EncodingIDs);
+    }
 
-    // The decode table is cleared for each top level decoder function. The
-    // predicates and decoders themselves, however, are shared across all
-    // decoders to give more opportunities for uniqueing.
-    TableInfo.Table.clear();
-    TableBuilder.buildTable(FC);
+    unsigned BucketSize = std::get<2>(Bucket.front()->first);
+    const unsigned BucketBitWidth =
+        SpecializeDecodersPerBitwidth ? BucketSize * 8 : 0;
 
-    // Print the table to the output stream.
-    OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
-                            BitWidth, EncodingIDs);
+    // Emit the decoder function for each BitWidth bucket.
+    emitDecoderFunction(OS, TableInfo.Decoders, BucketBitWidth);
   }
 
-  // For variable instruction, we emit a instruction length table
-  // to let the decoder know how long the instructions are.
-  // You can see example usage in M68k's disassembler.
-  if (IsVarLenInst)
-    emitInstrLenTable(OS, InstrLen);
-
   const bool HasCheckPredicate =
       OpcodeMask &
       ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail));
@@ -2575,9 +2637,6 @@ namespace {
   if (HasCheckPredicate)
     emitPredicateFunction(OS, TableInfo.Predicates);
 
-  // Emit the decoder function.
-  emitDecoderFunction(OS, TableInfo.Decoders);
-
   // Emit the main entry point for the decoder, decodeInstruction().
   emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask);
 

>From 6ef5aabe761a5b795c9452779cb0888292fc7bfa Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Mon, 25 Aug 2025 22:32:08 -0700
Subject: [PATCH 2/8] Review feedback

---
 .../RISCV/Disassembler/RISCVDisassembler.cpp  |   6 +-
 llvm/utils/TableGen/DecoderEmitter.cpp        | 106 +++++++-----------
 2 files changed, 45 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 494de07843237..ccec73fb3a455 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -558,8 +558,7 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
   return decodeZcmpRlist(Inst, Imm, Address, Decoder);
 }
 
-static DecodeStatus decodeCSSPushPopchk(MCInst &Inst,
-                                        const std::bitset<48> &Insn,
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint16_t Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5);
@@ -569,8 +568,7 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeXTHeadMemPair(MCInst &Inst,
-                                        const std::bitset<48> &Insn,
+static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 6117261b80bdd..31a992611064d 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2537,23 +2537,19 @@ namespace {
     emitInstrLenTable(OS, InstrLen);
   }
 
-  // Map of (namespace, hwmode, size) tuple to encoding IDs.
-  using EncMapTy = std::map<std::tuple<StringRef, unsigned, unsigned>,
-                            std::vector<unsigned>>;
-  EncMapTy EncMap;
+  // Map of (bitwidth, namespace, hwmode) tuple to encoding IDs.
+  std::map<std::tuple<unsigned, StringRef, unsigned>, std::vector<unsigned>>
+      EncMap;
 
-  // The set of valid instruction bitwidths for this target.
-  SmallSet<unsigned, 4> InstrBitwidths;
   for (const auto &[HwModeID, EncodingIDs] : EncodingIDsByHwMode) {
     for (unsigned EncodingID : EncodingIDs) {
       const InstructionEncoding &Encoding = Encodings[EncodingID];
       const Record *EncodingDef = Encoding.getRecord();
       unsigned Size = EncodingDef->getValueAsInt("Size");
+      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
       StringRef DecoderNamespace =
           EncodingDef->getValueAsString("DecoderNamespace");
-      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-      InstrBitwidths.insert(BitWidth);
-      EncMap[{DecoderNamespace, HwModeID, Size}].push_back(EncodingID);
+      EncMap[{BitWidth, DecoderNamespace, HwModeID}].push_back(EncodingID);
     }
   }
 
@@ -2569,65 +2565,49 @@ namespace {
     PrintFatalError(
         "Cannot specialize decoders for variable length instuctions");
 
-  // Bucket entries in the `EncMap` based on the instruction bitwidths if
-  // SpecializeDecodersPerBitwidth is enabled.
-  SmallVector<SmallVector<const EncMapTy::value_type *>> PerBitWidthEncMap;
-  if (SpecializeDecodersPerBitwidth) {
-    for (unsigned BW : InstrBitwidths) {
-      auto &Bucket = PerBitWidthEncMap.emplace_back();
-      for (const auto &Entry : EncMap) {
-        unsigned Size = std::get<2>(Entry.first);
-        const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-        if (BitWidth != BW)
-          continue;
-        Bucket.push_back(&Entry);
-      }
-    }
-  } else {
-    // If we are not emitting decoders specialized per bit width, create a
-    // single bucket
-    auto &Bucket = PerBitWidthEncMap.emplace_back();
-    Bucket.reserve(EncMap.size());
-    for (const auto &Entry : EncMap)
-      Bucket.push_back(&Entry);
-  }
-
+  // Entries in `EncMap` are already sorted by bitwidth. So bucketing per
+  // bitwidth can be done on-the-fly as we iterate over tha map.
   DecoderTableInfo TableInfo;
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;
 
-  for (const auto &Bucket : PerBitWidthEncMap) {
-    // Each BitWidth get's its own decoders and decoder function.
-    TableInfo.Decoders.clear();
-
-    for (const auto &[Key, EncodingIDs] : make_pointee_range(Bucket)) {
-      auto [DecoderNamespace, HwModeID, Size] = Key;
-      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-      // Emit the decoder for this (namespace, hwmode, width) combination.
-      FilterChooser FC(Encodings, EncodingIDs, BitWidth, Target);
-
-      // The decode table is cleared for each top level decoder function. The
-      // predicates and decoders themselves, however, are shared across
-      // different decoders to give more opportunities for uniqueing.
-      //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
-      //    across all decoder tables for a given bitwidth, else they are shared
-      //    across all decoder tables.
-      //  - predicates are shared across all decoder tables.
-      TableInfo.Table.clear();
-      TableBuilder.buildTable(FC);
-
-      // Print the table to the output stream.
-      OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
-                              BitWidth, EncodingIDs);
+  unsigned PrevBitWidth = 0;
+  for (const auto &[Key, EncodingIDs] : EncMap) {
+    auto [BitWidth, DecoderNamespace, HwModeID] = Key;
+
+    // If we are starting a new bitwidth and SpecializeDecodersPerBitwidth is
+    // enabled, emit the decoder function for the previous bitwidth.
+    if (SpecializeDecodersPerBitwidth && PrevBitWidth != BitWidth &&
+        PrevBitWidth != 0) {
+      emitDecoderFunction(OS, TableInfo.Decoders, PrevBitWidth);
+      // Each BitWidth get's its own decoders and decoder function.
+      TableInfo.Decoders.clear();
     }
-
-    unsigned BucketSize = std::get<2>(Bucket.front()->first);
-    const unsigned BucketBitWidth =
-        SpecializeDecodersPerBitwidth ? BucketSize * 8 : 0;
-
-    // Emit the decoder function for each BitWidth bucket.
-    emitDecoderFunction(OS, TableInfo.Decoders, BucketBitWidth);
-  }
+    PrevBitWidth = BitWidth;
+
+    // Emit the decoder for this (namespace, hwmode, width) combination.
+    FilterChooser FC(Encodings, EncodingIDs, BitWidth, Target);
+
+    // The decode table is cleared for each top level decoder function. The
+    // predicates and decoders themselves, however, are shared across
+    // different decoders to give more opportunities for uniqueing.
+    //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
+    //    across all decoder tables for a given bitwidth, else they are shared
+    //    across all decoder tables.
+    //  - predicates are shared across all decoder tables.
+    TableInfo.Table.clear();
+    TableBuilder.buildTable(FC);
+
+    // Print the table to the output stream.
+    OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
+                            BitWidth, EncodingIDs);
+  }
+
+  // Emit the decoder function for the last bucket. This will also emit the
+  // single decoder function if SpecializeDecodersPerBitwidth = false.
+  if (!SpecializeDecodersPerBitwidth)
+    PrevBitWidth = 0;
+  emitDecoderFunction(OS, TableInfo.Decoders, PrevBitWidth);
 
   const bool HasCheckPredicate =
       OpcodeMask &

>From 6420b4f706488f0b9e90e25575b8ff5f2163b5c1 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 26 Aug 2025 09:02:06 -0700
Subject: [PATCH 3/8] Use nested maps and use cl option

---
 llvm/include/llvm/Target/Target.td            |  7 --
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  1 -
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |  3 +-
 llvm/lib/Target/RISCV/CMakeLists.txt          |  3 +-
 llvm/lib/Target/RISCV/RISCV.td                |  1 -
 llvm/test/TableGen/HwModeEncodeDecode3.td     | 14 ++--
 llvm/utils/TableGen/DecoderEmitter.cpp        | 79 ++++++++++---------
 .../lib/Target/AMDGPU/Disassembler/BUILD.gn   |  7 +-
 .../lib/Target/RISCV/Disassembler/BUILD.gn    |  5 +-
 9 files changed, 64 insertions(+), 56 deletions(-)

diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 403c866e386ae..495b59ee916cf 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1158,13 +1158,6 @@ class InstrInfo {
   //
   // This option is a temporary migration help. It will go away.
   bit guessInstructionProperties = true;
-
-  // Generate decoders that are specialized per bit width in the generated
-  // decoder/disassembler. This requires use of different `InsnType` for
-  // different bitwidths and defining `InsnBitWidth` template specialization for
-  // the `InsnType` types used. Some common specializations are already defined
-  // in MCDecoder.h.
-  bit SpecializeDecodersPerBitwidth = false;
 }
 
 // Standard Pseudo Instructions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4d9185d40b9a5..edd3ce72d7df3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2100,7 +2100,6 @@ def FeatureISAVersion12_Generic: FeatureSet<
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
-  let SpecializeDecodersPerBitwidth = true;
 }
 
 def AMDGPUAsmParser : AsmParser {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130ea..56dc6f9e8e6e9 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -6,7 +6,8 @@ tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler
+              --specialize-decoders-per-bitwidth)
 tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 47329b2c2f4d2..531238ae85029 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -7,7 +7,8 @@ tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter)
 tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred)
 tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler
+              --specialize-decoders-per-bitwidth)
 tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM RISCVGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM RISCVGenMCPseudoLowering.inc -gen-pseudo-lowering)
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index bd8bc56f1da3a..b24d8637cb27f 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -85,7 +85,6 @@ include "RISCVPfmCounters.td"
 
 def RISCVInstrInfo : InstrInfo {
   let guessInstructionProperties = 0;
-  let SpecializeDecodersPerBitwidth = true;
 }
 
 def RISCVAsmParser : AsmParser {
diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td
index dbbf866f057e5..5e9ac7d17e45a 100644
--- a/llvm/test/TableGen/HwModeEncodeDecode3.td
+++ b/llvm/test/TableGen/HwModeEncodeDecode3.td
@@ -118,8 +118,6 @@ def unrelated: Instruction {
 // exact duplicates and could effectively be merged into one.
 // DECODER-LABEL: DecoderTable32
 // DECODER-DAG: Opcode: bar
-// DECODER-LABEL: DecoderTable64
-// DECODER-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-LABEL: DecoderTable_ModeA32
 // DECODER-DAG: Opcode: fooTypeEncA:foo
 // DECODER-DAG: Opcode: bar
@@ -138,13 +136,13 @@ def unrelated: Instruction {
 // DECODER-DAG: Opcode: unrelated
 // DECODER-LABEL: DecoderTableAlt_ModeC32
 // DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTable64
+// DECODER-DAG: Opcode: fooTypeEncDefault:foo
 
 // Under the 'O1' optimization level, unnecessary duplicate tables will be eliminated,
 // reducing the four ‘Alt’ tables down to just one.
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable32
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
-// DECODER-SUPPRESS-O1-LABEL: DecoderTable64
-// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable_ModeA32
 // DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncA:foo
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
@@ -157,6 +155,8 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
 // DECODER-SUPPRESS-O1-LABEL: DecoderTableAlt32
 // DECODER-SUPPRESS-O1-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-O1-LABEL: DecoderTable64
+// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncDefault:foo
 
 // Under the 'O2' optimization condition, instructions possessing the 'EncodingByHwMode'
 // attribute will be extracted from their original DecoderNamespace and placed into their
@@ -166,9 +166,6 @@ def unrelated: Instruction {
 // consider the interplay between HwMode and DecoderNamespace for their instructions.
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable32
 // DECODER-SUPPRESS-O2-DAG: Opcode: bar
-// DECODER-SUPPRESS-O2-LABEL: DecoderTable64
-// DECODER-SUPPRESS-O2-NOT: Opcode: bar
-// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable_ModeA32
 // DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncA:foo
 // DECODER-SUPPRESS-O2-NOT: Opcode: bar
@@ -181,6 +178,9 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O2-NOT: Opcode: bar
 // DECODER-SUPPRESS-O2-LABEL: DecoderTableAlt32
 // DECODER-SUPPRESS-O2-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-O2-LABEL: DecoderTable64
+// DECODER-SUPPRESS-O2-NOT: Opcode: bar
+// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncDefault:foo
 
 // For 'bar' and 'unrelated', we didn't assign any HwModes for them,
 // they should keep the same in the following four tables.
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 31a992611064d..9900f33f76e6e 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -94,6 +94,15 @@ static cl::opt<bool> UseFnTableInDecodeToMCInst(
         "of the generated code."),
     cl::init(false), cl::cat(DisassemblerEmitterCat));
 
+static cl::opt<bool> SpecializeDecodersPerBitwidth(
+    "specialize-decoders-per-bitwidth",
+    cl::desc("Specialize the generated `decodeToMCInst` function per bitwidth. "
+             "Can help reduce the code size. This requires use of different "
+             "`InsnType` for different bitwidths and defining `InsnBitWidth` "
+             "template specialization for the `InsnType` types used. Some "
+             "common specializations are already defined in MCDecoder.h."),
+    cl::init(false), cl::cat(DisassemblerEmitterCat));
+
 STATISTIC(NumEncodings, "Number of encodings considered");
 STATISTIC(NumEncodingsLackingDisasm,
           "Number of encodings without disassembler info");
@@ -2538,8 +2547,13 @@ namespace {
   }
 
   // Map of (bitwidth, namespace, hwmode) tuple to encoding IDs.
-  std::map<std::tuple<unsigned, StringRef, unsigned>, std::vector<unsigned>>
-      EncMap;
+  // Its organized as a nested map, with the (namespace, hwmode) as the key for
+  // the inner map and bitwidth as the key for the outer map. We use std::map
+  // for deterministic iteration order so that the code emitted is also
+  // deterministic.
+  using InnerKeyTy = std::pair<StringRef, unsigned>;
+  using InnerMapTy = std::map<InnerKeyTy, std::vector<unsigned>>;
+  std::map<unsigned, InnerMapTy> EncMap;
 
   for (const auto &[HwModeID, EncodingIDs] : EncodingIDsByHwMode) {
     for (unsigned EncodingID : EncodingIDs) {
@@ -2549,14 +2563,10 @@ namespace {
       const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
       StringRef DecoderNamespace =
           EncodingDef->getValueAsString("DecoderNamespace");
-      EncMap[{BitWidth, DecoderNamespace, HwModeID}].push_back(EncodingID);
+      EncMap[BitWidth][{DecoderNamespace, HwModeID}].push_back(EncodingID);
     }
   }
 
-  const bool SpecializeDecodersPerBitwidth =
-      Target.getInstructionSet()->getValueAsBit(
-          "SpecializeDecodersPerBitwidth");
-
   // Variable length instructions use the same `APInt` type for all instructions
   // so we cannot specialize decoders based on instruction bitwidths (which
   // requires using different `InstType` for differet bitwidths for the correct
@@ -2571,43 +2581,40 @@ namespace {
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;
 
-  unsigned PrevBitWidth = 0;
-  for (const auto &[Key, EncodingIDs] : EncMap) {
-    auto [BitWidth, DecoderNamespace, HwModeID] = Key;
+  for (const auto &[BitWidth, BWMap] : EncMap) {
+    for (const auto &[Key, EncodingIDs] : BWMap) {
+      auto [DecoderNamespace, HwModeID] = Key;
+
+      // Emit the decoder for this (namespace, hwmode, width) combination.
+      FilterChooser FC(Encodings, EncodingIDs);
+
+      // The decode table is cleared for each top level decoder function. The
+      // predicates and decoders themselves, however, are shared across
+      // different decoders to give more opportunities for uniqueing.
+      //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
+      //    across all decoder tables for a given bitwidth, else they are shared
+      //    across all decoder tables.
+      //  - predicates are shared across all decoder tables.
+      TableInfo.Table.clear();
+      TableBuilder.buildTable(FC);
+
+      // Print the table to the output stream.
+      OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
+                              BitWidth, EncodingIDs);
+    }
 
-    // If we are starting a new bitwidth and SpecializeDecodersPerBitwidth is
-    // enabled, emit the decoder function for the previous bitwidth.
-    if (SpecializeDecodersPerBitwidth && PrevBitWidth != BitWidth &&
-        PrevBitWidth != 0) {
-      emitDecoderFunction(OS, TableInfo.Decoders, PrevBitWidth);
-      // Each BitWidth get's its own decoders and decoder function.
+    // Each BitWidth get's its own decoders and decoder function if
+    // SpecializeDecodersPerBitwidth is enabled.
+    if (SpecializeDecodersPerBitwidth) {
+      emitDecoderFunction(OS, TableInfo.Decoders, BitWidth);
       TableInfo.Decoders.clear();
     }
-    PrevBitWidth = BitWidth;
-
-    // Emit the decoder for this (namespace, hwmode, width) combination.
-    FilterChooser FC(Encodings, EncodingIDs, BitWidth, Target);
-
-    // The decode table is cleared for each top level decoder function. The
-    // predicates and decoders themselves, however, are shared across
-    // different decoders to give more opportunities for uniqueing.
-    //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
-    //    across all decoder tables for a given bitwidth, else they are shared
-    //    across all decoder tables.
-    //  - predicates are shared across all decoder tables.
-    TableInfo.Table.clear();
-    TableBuilder.buildTable(FC);
-
-    // Print the table to the output stream.
-    OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
-                            BitWidth, EncodingIDs);
   }
 
   // Emit the decoder function for the last bucket. This will also emit the
   // single decoder function if SpecializeDecodersPerBitwidth = false.
   if (!SpecializeDecodersPerBitwidth)
-    PrevBitWidth = 0;
-  emitDecoderFunction(OS, TableInfo.Decoders, PrevBitWidth);
+    emitDecoderFunction(OS, TableInfo.Decoders, 0);
 
   const bool HasCheckPredicate =
       OpcodeMask &
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
index 11bc537936508..9cc98cd8642d6 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
@@ -2,7 +2,12 @@ import("//llvm/utils/TableGen/tablegen.gni")
 
 tablegen("AMDGPUGenDisassemblerTables") {
   visibility = [ ":Disassembler" ]
-  args = [ "-gen-disassembler" ]
+  args = [
+    "-gen-disassembler",
+    "-specialize-decoders-per-bitwidth",
+  ]
+
+  args = [   ]
   td_file = "../AMDGPU.td"
 }
 
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
index cb579221fd366..447a67af6be7b 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
@@ -2,7 +2,10 @@ import("//llvm/utils/TableGen/tablegen.gni")
 
 tablegen("RISCVGenDisassemblerTables") {
   visibility = [ ":Disassembler" ]
-  args = [ "-gen-disassembler" ]
+  args = [
+    "-gen-disassembler",
+    "-specialize-decoders-per-bitwidth",
+  ]
   td_file = "../RISCV.td"
 }
 

>From 934756eca1c433e6e5d01406d2b1b364bd895f9c Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 26 Aug 2025 11:35:42 -0700
Subject: [PATCH 4/8] Review feedback: Make help message succint

---
 llvm/utils/TableGen/DecoderEmitter.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 9900f33f76e6e..37c8b29e7209f 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -94,13 +94,14 @@ static cl::opt<bool> UseFnTableInDecodeToMCInst(
         "of the generated code."),
     cl::init(false), cl::cat(DisassemblerEmitterCat));
 
+// Enabling this option requires use of different `InsnType` for different
+// bitwidths and defining `InsnBitWidth` template specialization for the
+// `InsnType` types used. Some common specializations are already defined in
+// MCDecoder.h.
 static cl::opt<bool> SpecializeDecodersPerBitwidth(
     "specialize-decoders-per-bitwidth",
     cl::desc("Specialize the generated `decodeToMCInst` function per bitwidth. "
-             "Can help reduce the code size. This requires use of different "
-             "`InsnType` for different bitwidths and defining `InsnBitWidth` "
-             "template specialization for the `InsnType` types used. Some "
-             "common specializations are already defined in MCDecoder.h."),
+             "Helps reduce the code size."),
     cl::init(false), cl::cat(DisassemblerEmitterCat));
 
 STATISTIC(NumEncodings, "Number of encodings considered");
@@ -2576,7 +2577,7 @@ namespace {
         "Cannot specialize decoders for variable length instuctions");
 
   // Entries in `EncMap` are already sorted by bitwidth. So bucketing per
-  // bitwidth can be done on-the-fly as we iterate over tha map.
+  // bitwidth can be done on-the-fly as we iterate over the map.
   DecoderTableInfo TableInfo;
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;

>From ad52f1453206a05e0589c882000edd992d3dd3c5 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 26 Aug 2025 21:08:19 -0700
Subject: [PATCH 5/8] Use uint64_t for RISCV 48-bit insts

---
 llvm/include/llvm/MC/MCDecoder.h                     | 12 ++----------
 .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp       |  8 ++++++++
 .../Target/RISCV/Disassembler/RISCVDisassembler.cpp  | 11 ++++++++---
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/MC/MCDecoder.h b/llvm/include/llvm/MC/MCDecoder.h
index 6259ef5a3bd5d..f2da8b52251a6 100644
--- a/llvm/include/llvm/MC/MCDecoder.h
+++ b/llvm/include/llvm/MC/MCDecoder.h
@@ -75,18 +75,10 @@ insertBits(IntType &field, IntType bits, unsigned startBit, unsigned numBits) {
 // InsnBitWidth is essentially a type trait used by the decoder emitter to query
 // the supported bitwidth for a given type. But default, the value is 0, making
 // it an invalid type for use as `InsnType` when instantiating the decoder.
+// Individual targets are expected to provide specializations for these based
+// on their usage.
 template <typename T> inline constexpr uint32_t InsnBitWidth = 0;
 
-// Provide specializations for commonly used types.
-// Integer types.
-template <> inline constexpr uint32_t InsnBitWidth<uint8_t> = 8;
-template <> inline constexpr uint32_t InsnBitWidth<uint16_t> = 16;
-template <> inline constexpr uint32_t InsnBitWidth<uint32_t> = 32;
-template <> inline constexpr uint32_t InsnBitWidth<uint64_t> = 64;
-
-// std::bitset<N>.
-template <size_t N> inline constexpr uint32_t InsnBitWidth<std::bitset<N>> = N;
-
 } // namespace llvm::MCD
 
 #endif // LLVM_MC_MCDECODER_H
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ac42e51d447e3..2dc607608b80c 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -447,6 +447,14 @@ static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
 
 #include "AMDGPUGenDisassemblerTables.inc"
 
+// Define bitwidths for various types used to instantiate the decoder.
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 64;
+template <>
+inline constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<96>> = 96;
+template <>
+inline constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<128>> = 128;
+
 //===----------------------------------------------------------------------===//
 //
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index ccec73fb3a455..1447ba364c482 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -700,6 +700,12 @@ static constexpr DecoderListEntry DecoderList32[]{
     {DecoderTableZdinxRV32Only32, {}, "RV32-only Zdinx (Double in Integer)"},
 };
 
+// Define bitwidths for various types used to instantiate the decoder.
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint16_t> = 16;
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+// Use uint64_t to represent 48 bit instructions.
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 48;
+
 DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
                                                  ArrayRef<uint8_t> Bytes,
                                                  uint64_t Address,
@@ -790,11 +796,10 @@ DecodeStatus RISCVDisassembler::getInstruction48(MCInst &MI, uint64_t &Size,
   }
   Size = 6;
 
-  uint64_t InsnBits = 0;
+  uint64_t Insn = 0;
   for (size_t i = Size; i-- != 0;)
-    InsnBits += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
+    Insn += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
 
-  std::bitset<48> Insn = InsnBits;
   for (const DecoderListEntry &Entry : DecoderList48) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
       continue;

>From d5afae2edc1c9533e62c5e2ab6df514c0b0580bb Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 26 Aug 2025 22:10:04 -0700
Subject: [PATCH 6/8] Make InstBitWidth specializations static

---
 .../lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 8 ++++----
 llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp  | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 2dc607608b80c..80d194afa926b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -448,12 +448,12 @@ static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
 #include "AMDGPUGenDisassemblerTables.inc"
 
 // Define bitwidths for various types used to instantiate the decoder.
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 64;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 64;
 template <>
-inline constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<96>> = 96;
+static constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<96>> = 96;
 template <>
-inline constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<128>> = 128;
+static constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<128>> = 128;
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 1447ba364c482..4becd3bbb25f2 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -701,10 +701,10 @@ static constexpr DecoderListEntry DecoderList32[]{
 };
 
 // Define bitwidths for various types used to instantiate the decoder.
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint16_t> = 16;
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint16_t> = 16;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
 // Use uint64_t to represent 48 bit instructions.
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 48;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 48;
 
 DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
                                                  ArrayRef<uint8_t> Bytes,

>From be3450afa636d9c2f2d7a0dbb29c532726491afa Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Wed, 27 Aug 2025 09:59:40 -0700
Subject: [PATCH 7/8] Fixes for function tables, add lit test

---
 llvm/include/llvm/MC/MCDecoder.h              |   2 +-
 .../DecoderEmitterBitwidthSpecialization.td   | 169 ++++++++++++++++++
 llvm/test/TableGen/DecoderEmitterFnTable.td   |  16 +-
 llvm/utils/TableGen/DecoderEmitter.cpp        |  41 +++--
 4 files changed, 207 insertions(+), 21 deletions(-)
 create mode 100644 llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td

diff --git a/llvm/include/llvm/MC/MCDecoder.h b/llvm/include/llvm/MC/MCDecoder.h
index f2da8b52251a6..459c8a6a5ea34 100644
--- a/llvm/include/llvm/MC/MCDecoder.h
+++ b/llvm/include/llvm/MC/MCDecoder.h
@@ -77,7 +77,7 @@ insertBits(IntType &field, IntType bits, unsigned startBit, unsigned numBits) {
 // it an invalid type for use as `InsnType` when instantiating the decoder.
 // Individual targets are expected to provide specializations for these based
 // on their usage.
-template <typename T> inline constexpr uint32_t InsnBitWidth = 0;
+template <typename T> static constexpr uint32_t InsnBitWidth = 0;
 
 } // namespace llvm::MCD
 
diff --git a/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td b/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td
new file mode 100644
index 0000000000000..6fd9676939bde
--- /dev/null
+++ b/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td
@@ -0,0 +1,169 @@
+// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: llvm-tblgen -gen-disassembler -specialize-decoders-per-bitwidth -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-SPECIALIZE-NO-TABLE
+// RUN: llvm-tblgen -gen-disassembler -specialize-decoders-per-bitwidth -use-fn-table-in-decode-to-mcinst -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-SPECIALIZE-TABLE
+
+
+include "llvm/Target/Target.td"
+
+def archInstrInfo : InstrInfo { }
+
+def arch : Target {
+  let InstructionSet = archInstrInfo;
+}
+
+let Namespace = "arch" in {
+  def R0 : Register<"r0">;
+  def R1 : Register<"r1">;
+  def R2 : Register<"r2">;
+  def R3 : Register<"r3">;
+}
+def Regs : RegisterClass<"Regs", [i32], 32, (add R0, R1, R2, R3)>;
+
+// Bit 0 of the encoding determines the size (8 or 16 bits).
+// Bits {3..1} define the number define the number of operands encoded.
+class Instruction8Bit<int NumOps> : Instruction {
+  let Size = 1;
+  let OutOperandList = (outs);
+  field bits<8> Inst;
+  let Inst{0} = 0;
+  let Inst{3-1} = NumOps;
+}
+
+class Instruction16Bit<int NumOps> : Instruction {
+  let Size = 2;
+  let OutOperandList = (outs);
+  field bits<16> Inst;
+  let Inst{0} = 1;
+  let Inst{3-1} = NumOps;
+}
+
+// Define instructions to generate 4 cases in decodeToMCInst.
+// Each register operand needs 2 bits to encode.
+
+// An instruction with no inputs.
+def Inst0 : Instruction8Bit<0> {
+  let Inst{7-4} = 0;
+  let InOperandList = (ins);
+  let AsmString = "Inst0";
+}
+
+// An instruction with a single input.
+def Inst1 : Instruction8Bit<1> {
+  bits<2> r0;
+  let Inst{5-4} = r0;
+  let Inst{7-6} = 0;
+  let InOperandList = (ins Regs:$r0);
+  let AsmString = "Inst1";
+}
+
+// An instruction with two inputs. 
+def Inst2 : Instruction16Bit<2> {
+  bits<2> r0;
+  bits<2> r1;
+  let Inst{5-4} = r0;
+  let Inst{7-6} = r1;
+  let Inst{15-8} = 0;
+  let InOperandList = (ins Regs:$r0, Regs:$r1);
+  let AsmString = "Inst2";
+}
+
+// An instruction with three inputs. .
+def Inst3 : Instruction16Bit<3> {
+  bits<2> r0;
+  bits<2> r1;
+  bits<2> r2;
+  let Inst{5-4} = r0;
+  let Inst{7-6} = r1;
+  let Inst{9-8} = r2;
+  let Inst{15-10} = 0;
+  let InOperandList = (ins Regs:$r0, Regs:$r1, Regs:$r2);
+  let AsmString = "Inst3";
+}
+
+// -----------------------------------------------------------------------------
+// In the default case, we emit a single decodeToMCinst function and DecodeIdx
+// is shared across all bitwidths.
+
+// CHECK-DEFAULT-LABEL: DecoderTable8[25]
+// CHECK-DEFAULT:         DecodeIdx: 0
+// CHECK-DEFAULT:         DecodeIdx: 1
+// CHECK-DEFAULT:       };
+
+// CHECK-DEFAULT-LABEL: DecoderTable16[25]
+// CHECK-DEFAULT:         DecodeIdx: 2
+// CHECK-DEFAULT:         DecodeIdx: 3
+// CHECK-DEFAULT:       };
+
+// CHECK-DEFAULT-LABEL: template <typename InsnType>
+// CHECK-DEFAULT-NEXT:  static DecodeStatus decodeToMCInst
+// CHECK-DEFAULT:         case 0
+// CHECK-DEFAULT:         case 1
+// CHECK-DEFAULT:         case 2
+// CHECK-DEFAULT:         case 3
+
+// -----------------------------------------------------------------------------
+// When we specialize per bitwidth, we emit 2 decodeToMCInst functions and
+// DecodeIdx is assigned per bit width.
+
+// CHECK-SPECIALIZE-NO-TABLE-LABEL:   DecoderTable8[25]
+// CHECK-SPECIALIZE-NO-TABLE:           DecodeIdx: 0
+// CHECK-SPECIALIZE-NO-TABLE:           DecodeIdx: 1
+// CHECK-SPECIALIZE-NO-TABLE:         };
+
+// CHECK-SPECIALIZE-NO-TABLE-LABEL:   template <typename InsnType>
+// CHECK-SPECIALIZE-NO-TABLE-NEXT:    static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
+// CHECK-SPECIALIZE-NO-TABLE-NEXT:    decodeToMCInst
+// CHECK-SPECIALIZE-NO-TABLE:           case 0
+// CHECK-SPECIALIZE-NO-TABLE:           case 1
+
+// CHECK-SPECIALIZE-NO-TABLE-LABEL:   DecoderTable16[25]
+// CHECK-SPECIALIZE-NO-TABLE:           DecodeIdx: 0
+// CHECK-SPECIALIZE-NO-TABLE:           DecodeIdx: 1
+// CHECK-SPECIALIZE-NO-TABLE:         };
+
+// CHECK-SPECIALIZE-NO-TABLE-LABEL:   template <typename InsnType>
+// CHECK-SPECIALIZE-NO-TABLE-NEXT:    static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
+// CHECK-SPECIALIZE-NO-TABLE-NEXT:    decodeToMCInst
+// CHECK-SPECIALIZE-NO-TABLE:           case 0
+// CHECK-SPECIALIZE-NO-TABLE:           case 1
+
+// -----------------------------------------------------------------------------
+// Per bitwidth specialization with function table.
+// CHECK-SPECIALIZE-TABLE-LABEL:    DecoderTable8[25]
+// CHECK-SPECIALIZE-TABLE:            DecodeIdx: 0
+// CHECK-SPECIALIZE-TABLE:            DecodeIdx: 1
+// CHECK-SPECIALIZE-TABLE:          };
+
+// 8 bit functions and function table.
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_8bit_0
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_8bit_1
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeToMCInst
+// CHECK-SPECIALIZE-TABLE-LABEL:      static constexpr DecodeFnTy decodeFnTable[] = {
+// CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_8bit_0,
+// CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_8bit_1,
+// CHECK-SPECIALIZE-TABLE-NEXT:     };
+
+// 16 bit functions and function table.
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_16bit_0
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_16bit_1
+
+// CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
+// CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
+// CHECK-SPECIALIZE-TABLE-NEXT:     decodeToMCInst
+// CHECK-SPECIALIZE-TABLE-LABEL:      static constexpr DecodeFnTy decodeFnTable[] = {
+// CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_16bit_0,
+// CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_16bit_1,
+// CHECK-SPECIALIZE-TABLE-NEXT:     };
diff --git a/llvm/test/TableGen/DecoderEmitterFnTable.td b/llvm/test/TableGen/DecoderEmitterFnTable.td
index 7bed18c19a513..8929e6da716e6 100644
--- a/llvm/test/TableGen/DecoderEmitterFnTable.td
+++ b/llvm/test/TableGen/DecoderEmitterFnTable.td
@@ -71,14 +71,14 @@ def Inst3 : TestInstruction {
   let AsmString = "Inst3";
 }
 
-// CHECK-LABEL: DecodeStatus decodeFn0(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn1(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn2(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn3(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_0(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_1(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_2(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_3(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
 // CHECK-LABEL: decodeToMCInst(unsigned Idx, DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
 // CHECK: static constexpr DecodeFnTy decodeFnTable[]
-// CHECK-NEXT: decodeFn0,
-// CHECK-NEXT: decodeFn1,
-// CHECK-NEXT: decodeFn2,
-// CHECK-NEXT: decodeFn3,
+// CHECK-NEXT: decodeFn_0,
+// CHECK-NEXT: decodeFn_1,
+// CHECK-NEXT: decodeFn_2,
+// CHECK-NEXT: decodeFn_3,
 // CHECK: return decodeFnTable[Idx](S, insn, MI, Address, Decoder, DecodeComplete)
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 37c8b29e7209f..2036ab1a8a56f 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -951,12 +951,32 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
       "DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const "
       "MCDisassembler *Decoder, bool &DecodeComplete";
 
+  // Print the name of the decode function to OS.
+  auto PrintDecodeFnName = [&OS, BucketBitWidth](unsigned DecodeIdx) {
+    OS << "decodeFn";
+    if (BucketBitWidth != 0) {
+      OS << '_' << BucketBitWidth << "bit";
+    }
+    OS << '_' << DecodeIdx;
+  };
+
+  // Print the template statement.
+  auto PrintTemplate = [&OS, BucketBitWidth]() {
+    OS << "template <typename InsnType>\n";
+    OS << "static ";
+    if (BucketBitWidth != 0)
+      OS << "std::enable_if_t<InsnBitWidth<InsnType> == " << BucketBitWidth
+         << ", DecodeStatus>\n";
+    else
+      OS << "DecodeStatus ";
+  };
+
   if (UseFnTableInDecodeToMCInst) {
     // Emit a function for each case first.
     for (const auto &[Index, Decoder] : enumerate(Decoders)) {
-      OS << "template <typename InsnType>\n";
-      OS << "static DecodeStatus decodeFn" << Index << "(" << DecodeParams
-         << ") {\n";
+      PrintTemplate();
+      PrintDecodeFnName(Index);
+      OS << "(" << DecodeParams << ") {\n";
       OS << "  using namespace llvm::MCD;\n";
       OS << "  " << TmpTypeDecl;
       OS << "  [[maybe_unused]] TmpType tmp;\n";
@@ -967,13 +987,7 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
   }
 
   OS << "// Handling " << Decoders.size() << " cases.\n";
-  OS << "template <typename InsnType>\n";
-  OS << "static ";
-  if (BucketBitWidth != 0)
-    OS << "std::enable_if_t<InsnBitWidth<InsnType> == " << BucketBitWidth
-       << ", DecodeStatus>\n";
-  else
-    OS << "DecodeStatus ";
+  PrintTemplate();
   OS << "decodeToMCInst(unsigned Idx, " << DecodeParams << ") {\n";
   OS << "  using namespace llvm::MCD;\n";
   OS << "  DecodeComplete = true;\n";
@@ -982,8 +996,11 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
     // Build a table of function pointers
     OS << "  using DecodeFnTy = DecodeStatus (*)(" << DecodeParams << ");\n";
     OS << "  static constexpr DecodeFnTy decodeFnTable[] = {\n";
-    for (size_t Index : llvm::seq(Decoders.size()))
-      OS << "    decodeFn" << Index << ",\n";
+    for (size_t Index : llvm::seq(Decoders.size())) {
+      OS << "    ";
+      PrintDecodeFnName(Index);
+      OS << ",\n";
+    }
     OS << "  };\n";
     OS << "  if (Idx >= " << Decoders.size() << ")\n";
     OS << "    llvm_unreachable(\"Invalid decoder index!\");\n";

>From 590dbbbad3cdcdcb6b7046cc8e3a4104e7323339 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Wed, 27 Aug 2025 11:01:38 -0700
Subject: [PATCH 8/8] Add missing checks for 16-bit decoder table

---
 .../TableGen/DecoderEmitterBitwidthSpecialization.td   | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td b/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td
index 6fd9676939bde..5a5b99a65f6a1 100644
--- a/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td
+++ b/llvm/test/TableGen/DecoderEmitterBitwidthSpecialization.td
@@ -129,12 +129,13 @@ def Inst3 : Instruction16Bit<3> {
 
 // -----------------------------------------------------------------------------
 // Per bitwidth specialization with function table.
+
+// 8 bit deccoder table, functions, and function table.
 // CHECK-SPECIALIZE-TABLE-LABEL:    DecoderTable8[25]
 // CHECK-SPECIALIZE-TABLE:            DecodeIdx: 0
 // CHECK-SPECIALIZE-TABLE:            DecodeIdx: 1
 // CHECK-SPECIALIZE-TABLE:          };
 
-// 8 bit functions and function table.
 // CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
 // CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 8, DecodeStatus>
 // CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_8bit_0
@@ -151,7 +152,12 @@ def Inst3 : Instruction16Bit<3> {
 // CHECK-SPECIALIZE-TABLE-NEXT:       decodeFn_8bit_1,
 // CHECK-SPECIALIZE-TABLE-NEXT:     };
 
-// 16 bit functions and function table.
+// 16 bit deccoder table, functions, and function table.
+// CHECK-SPECIALIZE-TABLE-LABEL:    DecoderTable16[25]
+// CHECK-SPECIALIZE-TABLE:            DecodeIdx: 0
+// CHECK-SPECIALIZE-TABLE:            DecodeIdx: 1
+// CHECK-SPECIALIZE-TABLE:          };
+
 // CHECK-SPECIALIZE-TABLE-LABEL:    template <typename InsnType>
 // CHECK-SPECIALIZE-TABLE-NEXT:     static std::enable_if_t<InsnBitWidth<InsnType> == 16, DecodeStatus>
 // CHECK-SPECIALIZE-TABLE-NEXT:     decodeFn_16bit_0