[llvm] [LLVM][MC][DecoderEmitter] Add support to specialize decoder per bitwidth (PR #154865)

Tue Aug 26 22:10:42 PDT 2025

https://github.com/jurahul updated https://github.com/llvm/llvm-project/pull/154865

>From 4ee09bb556e22de01879090adf434dc12cb7c521 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Thu, 21 Aug 2025 16:08:03 -0700
Subject: [PATCH 1/6] Cull decoders

---
 llvm/include/llvm/MC/MCDecoder.h              |  25 ++++
 llvm/include/llvm/Target/Target.td            |   7 +
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   1 +
 .../Disassembler/AMDGPUDisassembler.cpp       |  29 ++---
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h  |  38 ------
 .../RISCV/Disassembler/RISCVDisassembler.cpp  |  19 ++-
 llvm/lib/Target/RISCV/RISCV.td                |   1 +
 llvm/test/TableGen/VarLenDecoder.td           |  11 +-
 llvm/utils/TableGen/DecoderEmitter.cpp        | 121 +++++++++++++-----
 9 files changed, 153 insertions(+), 99 deletions(-)

diff --git a/llvm/include/llvm/MC/MCDecoder.h b/llvm/include/llvm/MC/MCDecoder.h
index 70762a4a5ebae..6259ef5a3bd5d 100644
--- a/llvm/include/llvm/MC/MCDecoder.h
+++ b/llvm/include/llvm/MC/MCDecoder.h
@@ -12,6 +12,7 @@
 
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/Support/MathExtras.h"
+#include <bitset>
 #include <cassert>
 
 namespace llvm::MCD {
@@ -48,6 +49,15 @@ fieldFromInstruction(const InsnType &Insn, unsigned StartBit,
   return Insn.extractBitsAsZExtValue(NumBits, StartBit);
 }
 
+template <size_t N>
+uint64_t fieldFromInstruction(const std::bitset<N> &Insn, unsigned StartBit,
+                              unsigned NumBits) {
+  assert(StartBit + NumBits <= N && "Instruction field out of bounds!");
+  assert(NumBits <= 64 && "Cannot support >64-bit extractions!");
+  const std::bitset<N> Mask(maskTrailingOnes<uint64_t>(NumBits));
+  return ((Insn >> StartBit) & Mask).to_ullong();
+}
+
 // Helper function for inserting bits extracted from an encoded instruction into
 // an integer-typed field.
 template <typename IntType>
@@ -62,6 +72,21 @@ insertBits(IntType &field, IntType bits, unsigned startBit, unsigned numBits) {
   field |= bits << startBit;
 }
 
+// InsnBitWidth is essentially a type trait used by the decoder emitter to query
+// the supported bitwidth for a given type. By default, the value is 0, making
+// it an invalid type for use as `InsnType` when instantiating the decoder.
+template <typename T> inline constexpr uint32_t InsnBitWidth = 0;
+
+// Provide specializations for commonly used types.
+// Integer types.
+template <> inline constexpr uint32_t InsnBitWidth<uint8_t> = 8;
+template <> inline constexpr uint32_t InsnBitWidth<uint16_t> = 16;
+template <> inline constexpr uint32_t InsnBitWidth<uint32_t> = 32;
+template <> inline constexpr uint32_t InsnBitWidth<uint64_t> = 64;
+
+// std::bitset<N>.
+template <size_t N> inline constexpr uint32_t InsnBitWidth<std::bitset<N>> = N;
+
 } // namespace llvm::MCD
 
 #endif // LLVM_MC_MCDECODER_H
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 495b59ee916cf..403c866e386ae 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1158,6 +1158,13 @@ class InstrInfo {
   //
   // This option is a temporary migration help. It will go away.
   bit guessInstructionProperties = true;
+
+  // Generate decoders that are specialized per bit width in the generated
+  // decoder/disassembler. This requires use of different `InsnType` for
+  // different bitwidths and defining `InsnBitWidth` template specialization for
+  // the `InsnType` types used. Some common specializations are already defined
+  // in MCDecoder.h.
+  bit SpecializeDecodersPerBitwidth = false;
 }
 
 // Standard Pseudo Instructions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index edd3ce72d7df3..4d9185d40b9a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2100,6 +2100,7 @@ def FeatureISAVersion12_Generic: FeatureSet<
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
+  let SpecializeDecodersPerBitwidth = true;
 }
 
 def AMDGPUAsmParser : AsmParser {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 6a2beeed41dfd..ac42e51d447e3 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/Compiler.h"
 
 using namespace llvm;
+using namespace llvm::MCD;
 
 #define DEBUG_TYPE "amdgpu-disassembler"
 
@@ -498,26 +499,24 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
   return Res;
 }
 
-static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<96> eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 12);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Hi(read<uint32_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(4);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
-static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<128> eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 16);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Hi(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
 void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
@@ -600,14 +599,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
     // encodings
     if (isGFX1250() && Bytes.size() >= 16) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
         break;
       Bytes = Bytes_.slice(0, MaxInstBytesNum);
     }
 
     if (isGFX11Plus() && Bytes.size() >= 12) {
-      DecoderUInt128 DecW = eat12Bytes(Bytes);
+      std::bitset<96> DecW = eat12Bytes(Bytes);
 
       if (isGFX11() &&
           tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
@@ -642,7 +641,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
     } else if (Bytes.size() >= 16 &&
                STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
         break;
 
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index f4d164bf10c3c..ded447b6f8d5a 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -32,44 +32,6 @@ class MCOperand;
 class MCSubtargetInfo;
 class Twine;
 
-// Exposes an interface expected by autogenerated code in
-// FixedLenDecoderEmitter
-class DecoderUInt128 {
-private:
-  uint64_t Lo = 0;
-  uint64_t Hi = 0;
-
-public:
-  DecoderUInt128() = default;
-  DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
-  operator bool() const { return Lo || Hi; }
-  uint64_t extractBitsAsZExtValue(unsigned NumBits,
-                                  unsigned BitPosition) const {
-    assert(NumBits && NumBits <= 64);
-    assert(BitPosition < 128);
-    uint64_t Val;
-    if (BitPosition < 64)
-      Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
-    else
-      Val = Hi >> (BitPosition - 64);
-    return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
-  }
-  DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
-    return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
-  }
-  DecoderUInt128 operator&(const uint64_t &RHS) const {
-    return *this & DecoderUInt128(RHS);
-  }
-  DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
-  bool operator==(const DecoderUInt128 &RHS) {
-    return Lo == RHS.Lo && Hi == RHS.Hi;
-  }
-  bool operator!=(const DecoderUInt128 &RHS) {
-    return Lo != RHS.Lo || Hi != RHS.Hi;
-  }
-  bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); }
-};
-
 //===----------------------------------------------------------------------===//
 // AMDGPUDisassembler
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index dbb16fce8390a..494de07843237 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -558,7 +558,8 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
   return decodeZcmpRlist(Inst, Imm, Address, Decoder);
 }
 
-static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst,
+                                        const std::bitset<48> &Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5);
@@ -568,7 +569,8 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
+static DecodeStatus decodeXTHeadMemPair(MCInst &Inst,
+                                        const std::bitset<48> &Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
@@ -710,9 +712,7 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
   }
   Size = 4;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read32le(Bytes.data());
+  uint32_t Insn = support::endian::read32le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList32) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -758,9 +758,7 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
   }
   Size = 2;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read16le(Bytes.data());
+  uint16_t Insn = support::endian::read16le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList16) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -794,10 +792,11 @@ DecodeStatus RISCVDisassembler::getInstruction48(MCInst &MI, uint64_t &Size,
   }
   Size = 6;
 
-  uint64_t Insn = 0;
+  uint64_t InsnBits = 0;
   for (size_t i = Size; i-- != 0;)
-    Insn += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
+    InsnBits += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
 
+  std::bitset<48> Insn = InsnBits;
   for (const DecoderListEntry &Entry : DecoderList48) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
       continue;
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index b24d8637cb27f..bd8bc56f1da3a 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -85,6 +85,7 @@ include "RISCVPfmCounters.td"
 
 def RISCVInstrInfo : InstrInfo {
   let guessInstructionProperties = 0;
+  let SpecializeDecodersPerBitwidth = true;
 }
 
 def RISCVAsmParser : AsmParser {
diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/VarLenDecoder.td
index 769c5895ec3c1..10e254f7673e6 100644
--- a/llvm/test/TableGen/VarLenDecoder.td
+++ b/llvm/test/TableGen/VarLenDecoder.td
@@ -47,6 +47,12 @@ def FOO32 : MyVarInst<MemOp32> {
   );
 }
 
+// Instruction length table
+// CHECK: InstrLenTable
+// CHECK: 27,
+// CHECK-NEXT: 43,
+// CHECK-NEXT: };
+
 // CHECK-SMALL:      /* 0 */       MCD::OPC_ExtractField, 3, 5,  // Inst{7-3} ...
 // CHECK-SMALL-NEXT: /* 3 */       MCD::OPC_FilterValue, 8, 4, 0, // Skip to: 11
 // CHECK-SMALL-NEXT: /* 7 */       MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16
@@ -61,11 +67,6 @@ def FOO32 : MyVarInst<MemOp32> {
 // CHECK-LARGE-NEXT: /* 14 */      MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32
 // CHECK-LARGE-NEXT: };
 
-// Instruction length table
-// CHECK: 27,
-// CHECK-NEXT: 43,
-// CHECK-NEXT: };
-
 // CHECK:      case 0:
 // CHECK-NEXT: tmp = fieldFromInstruction(insn, 8, 3);
 // CHECK-NEXT: if (!Check(S, DecodeRegClassRegisterClass(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; }
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index ecdc48775c9c1..6117261b80bdd 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
@@ -354,7 +355,8 @@ class DecoderEmitter {
   void emitPredicateFunction(formatted_raw_ostream &OS,
                              PredicateSet &Predicates) const;
   void emitDecoderFunction(formatted_raw_ostream &OS,
-                           DecoderSet &Decoders) const;
+                           const DecoderSet &Decoders,
+                           unsigned BucketBitWidth) const;
 
   // run - Output the code emitter
   void run(raw_ostream &o) const;
@@ -924,7 +926,8 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS,
 }
 
 void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
-                                         DecoderSet &Decoders) const {
+                                         const DecoderSet &Decoders,
+                                         unsigned BucketBitWidth) const {
   // The decoder function is just a big switch statement or a table of function
   // pointers based on the input decoder index.
 
@@ -955,8 +958,13 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
 
   OS << "// Handling " << Decoders.size() << " cases.\n";
   OS << "template <typename InsnType>\n";
-  OS << "static DecodeStatus decodeToMCInst(unsigned Idx, " << DecodeParams
-     << ") {\n";
+  OS << "static ";
+  if (BucketBitWidth != 0)
+    OS << "std::enable_if_t<InsnBitWidth<InsnType> == " << BucketBitWidth
+       << ", DecodeStatus>\n";
+  else
+    OS << "DecodeStatus ";
+  OS << "decodeToMCInst(unsigned Idx, " << DecodeParams << ") {\n";
   OS << "  using namespace llvm::MCD;\n";
   OS << "  DecodeComplete = true;\n";
 
@@ -969,7 +977,6 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
     OS << "  };\n";
     OS << "  if (Idx >= " << Decoders.size() << ")\n";
     OS << "    llvm_unreachable(\"Invalid decoder index!\");\n";
-
     OS << "  return decodeFnTable[Idx](S, insn, MI, Address, Decoder, "
           "DecodeComplete);\n";
   } else {
@@ -2514,21 +2521,29 @@ namespace {
 )";
 
   // Do extra bookkeeping for variable-length encodings.
-  std::vector<unsigned> InstrLen;
   bool IsVarLenInst = Target.hasVariableLengthEncodings();
   unsigned MaxInstLen = 0;
   if (IsVarLenInst) {
-    InstrLen.resize(Target.getInstructions().size(), 0);
+    std::vector<unsigned> InstrLen(Target.getInstructions().size(), 0);
     for (const InstructionEncoding &Encoding : Encodings) {
       MaxInstLen = std::max(MaxInstLen, Encoding.getBitWidth());
       InstrLen[Target.getInstrIntValue(Encoding.getInstruction()->TheDef)] =
           Encoding.getBitWidth();
     }
+
+    // For variable-length instructions, we emit an instruction length table to
+    // let the decoder know how long the instructions are. You can see example
+    // usage in M68k's disassembler.
+    emitInstrLenTable(OS, InstrLen);
   }
 
   // Map of (namespace, hwmode, size) tuple to encoding IDs.
-  std::map<std::tuple<StringRef, unsigned, unsigned>, std::vector<unsigned>>
-      EncMap;
+  using EncMapTy = std::map<std::tuple<StringRef, unsigned, unsigned>,
+                            std::vector<unsigned>>;
+  EncMapTy EncMap;
+
+  // The set of valid instruction bitwidths for this target.
+  SmallSet<unsigned, 4> InstrBitwidths;
   for (const auto &[HwModeID, EncodingIDs] : EncodingIDsByHwMode) {
     for (unsigned EncodingID : EncodingIDs) {
       const InstructionEncoding &Encoding = Encodings[EncodingID];
@@ -2536,37 +2551,84 @@ namespace {
       unsigned Size = EncodingDef->getValueAsInt("Size");
       StringRef DecoderNamespace =
           EncodingDef->getValueAsString("DecoderNamespace");
+      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
+      InstrBitwidths.insert(BitWidth);
       EncMap[{DecoderNamespace, HwModeID, Size}].push_back(EncodingID);
     }
   }
 
+  const bool SpecializeDecodersPerBitwidth =
+      Target.getInstructionSet()->getValueAsBit(
+          "SpecializeDecodersPerBitwidth");
+
+  // Variable length instructions use the same `APInt` type for all instructions
+  // so we cannot specialize decoders based on instruction bitwidths (which
+  // requires using different `InsnType` for different bitwidths for the correct
+  // template specialization to kick in).
+  if (IsVarLenInst && SpecializeDecodersPerBitwidth)
+    PrintFatalError(
+        "Cannot specialize decoders for variable length instuctions");
+
+  // Bucket entries in the `EncMap` based on the instruction bitwidths if
+  // SpecializeDecodersPerBitwidth is enabled.
+  SmallVector<SmallVector<const EncMapTy::value_type *>> PerBitWidthEncMap;
+  if (SpecializeDecodersPerBitwidth) {
+    for (unsigned BW : InstrBitwidths) {
+      auto &Bucket = PerBitWidthEncMap.emplace_back();
+      for (const auto &Entry : EncMap) {
+        unsigned Size = std::get<2>(Entry.first);
+        const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
+        if (BitWidth != BW)
+          continue;
+        Bucket.push_back(&Entry);
+      }
+    }
+  } else {
+    // If we are not emitting decoders specialized per bit width, create a
+    // single bucket
+    auto &Bucket = PerBitWidthEncMap.emplace_back();
+    Bucket.reserve(EncMap.size());
+    for (const auto &Entry : EncMap)
+      Bucket.push_back(&Entry);
+  }
+
   DecoderTableInfo TableInfo;
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;
 
-  for (const auto &[Key, EncodingIDs] : EncMap) {
-    auto [DecoderNamespace, HwModeID, Size] = Key;
-    const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-    // Emit the decoder for this (namespace, hwmode, width) combination.
-    FilterChooser FC(Encodings, EncodingIDs);
+  for (const auto &Bucket : PerBitWidthEncMap) {
+    // Each BitWidth get's its own decoders and decoder function.
+    TableInfo.Decoders.clear();
+
+    for (const auto &[Key, EncodingIDs] : make_pointee_range(Bucket)) {
+      auto [DecoderNamespace, HwModeID, Size] = Key;
+      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
+      // Emit the decoder for this (namespace, hwmode, width) combination.
+      FilterChooser FC(Encodings, EncodingIDs, BitWidth, Target);
+
+      // The decode table is cleared for each top level decoder function. The
+      // predicates and decoders themselves, however, are shared across
+      // different decoders to give more opportunities for uniqueing.
+      //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
+      //    across all decoder tables for a given bitwidth, else they are shared
+      //    across all decoder tables.
+      //  - predicates are shared across all decoder tables.
+      TableInfo.Table.clear();
+      TableBuilder.buildTable(FC);
+
+      // Print the table to the output stream.
+      OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
+                              BitWidth, EncodingIDs);
+    }
 
-    // The decode table is cleared for each top level decoder function. The
-    // predicates and decoders themselves, however, are shared across all
-    // decoders to give more opportunities for uniqueing.
-    TableInfo.Table.clear();
-    TableBuilder.buildTable(FC);
+    unsigned BucketSize = std::get<2>(Bucket.front()->first);
+    const unsigned BucketBitWidth =
+        SpecializeDecodersPerBitwidth ? BucketSize * 8 : 0;
 
-    // Print the table to the output stream.
-    OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
-                            BitWidth, EncodingIDs);
+    // Emit the decoder function for each BitWidth bucket.
+    emitDecoderFunction(OS, TableInfo.Decoders, BucketBitWidth);
   }
 
-  // For variable instruction, we emit a instruction length table
-  // to let the decoder know how long the instructions are.
-  // You can see example usage in M68k's disassembler.
-  if (IsVarLenInst)
-    emitInstrLenTable(OS, InstrLen);
-
   const bool HasCheckPredicate =
       OpcodeMask &
       ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail));
@@ -2575,9 +2637,6 @@ namespace {
   if (HasCheckPredicate)
     emitPredicateFunction(OS, TableInfo.Predicates);
 
-  // Emit the decoder function.
-  emitDecoderFunction(OS, TableInfo.Decoders);
-
   // Emit the main entry point for the decoder, decodeInstruction().
   emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask);
 

>From 613a7a047aba5ef216c78096f76349ccd1180738 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Mon, 25 Aug 2025 22:32:08 -0700
Subject: [PATCH 2/6] Review feedback

---
 .../RISCV/Disassembler/RISCVDisassembler.cpp  |   6 +-
 llvm/utils/TableGen/DecoderEmitter.cpp        | 106 +++++++-----------
 2 files changed, 45 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 494de07843237..ccec73fb3a455 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -558,8 +558,7 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
   return decodeZcmpRlist(Inst, Imm, Address, Decoder);
 }
 
-static DecodeStatus decodeCSSPushPopchk(MCInst &Inst,
-                                        const std::bitset<48> &Insn,
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint16_t Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5);
@@ -569,8 +568,7 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeXTHeadMemPair(MCInst &Inst,
-                                        const std::bitset<48> &Insn,
+static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 6117261b80bdd..31a992611064d 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2537,23 +2537,19 @@ namespace {
     emitInstrLenTable(OS, InstrLen);
   }
 
-  // Map of (namespace, hwmode, size) tuple to encoding IDs.
-  using EncMapTy = std::map<std::tuple<StringRef, unsigned, unsigned>,
-                            std::vector<unsigned>>;
-  EncMapTy EncMap;
+  // Map of (bitwidth, namespace, hwmode) tuple to encoding IDs.
+  std::map<std::tuple<unsigned, StringRef, unsigned>, std::vector<unsigned>>
+      EncMap;
 
-  // The set of valid instruction bitwidths for this target.
-  SmallSet<unsigned, 4> InstrBitwidths;
   for (const auto &[HwModeID, EncodingIDs] : EncodingIDsByHwMode) {
     for (unsigned EncodingID : EncodingIDs) {
       const InstructionEncoding &Encoding = Encodings[EncodingID];
       const Record *EncodingDef = Encoding.getRecord();
       unsigned Size = EncodingDef->getValueAsInt("Size");
+      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
       StringRef DecoderNamespace =
           EncodingDef->getValueAsString("DecoderNamespace");
-      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-      InstrBitwidths.insert(BitWidth);
-      EncMap[{DecoderNamespace, HwModeID, Size}].push_back(EncodingID);
+      EncMap[{BitWidth, DecoderNamespace, HwModeID}].push_back(EncodingID);
     }
   }
 
@@ -2569,65 +2565,49 @@ namespace {
     PrintFatalError(
         "Cannot specialize decoders for variable length instuctions");
 
-  // Bucket entries in the `EncMap` based on the instruction bitwidths if
-  // SpecializeDecodersPerBitwidth is enabled.
-  SmallVector<SmallVector<const EncMapTy::value_type *>> PerBitWidthEncMap;
-  if (SpecializeDecodersPerBitwidth) {
-    for (unsigned BW : InstrBitwidths) {
-      auto &Bucket = PerBitWidthEncMap.emplace_back();
-      for (const auto &Entry : EncMap) {
-        unsigned Size = std::get<2>(Entry.first);
-        const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-        if (BitWidth != BW)
-          continue;
-        Bucket.push_back(&Entry);
-      }
-    }
-  } else {
-    // If we are not emitting decoders specialized per bit width, create a
-    // single bucket
-    auto &Bucket = PerBitWidthEncMap.emplace_back();
-    Bucket.reserve(EncMap.size());
-    for (const auto &Entry : EncMap)
-      Bucket.push_back(&Entry);
-  }
-
+  // Entries in `EncMap` are already sorted by bitwidth. So bucketing per
+  // bitwidth can be done on-the-fly as we iterate over the map.
   DecoderTableInfo TableInfo;
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;
 
-  for (const auto &Bucket : PerBitWidthEncMap) {
-    // Each BitWidth get's its own decoders and decoder function.
-    TableInfo.Decoders.clear();
-
-    for (const auto &[Key, EncodingIDs] : make_pointee_range(Bucket)) {
-      auto [DecoderNamespace, HwModeID, Size] = Key;
-      const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
-      // Emit the decoder for this (namespace, hwmode, width) combination.
-      FilterChooser FC(Encodings, EncodingIDs, BitWidth, Target);
-
-      // The decode table is cleared for each top level decoder function. The
-      // predicates and decoders themselves, however, are shared across
-      // different decoders to give more opportunities for uniqueing.
-      //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
-      //    across all decoder tables for a given bitwidth, else they are shared
-      //    across all decoder tables.
-      //  - predicates are shared across all decoder tables.
-      TableInfo.Table.clear();
-      TableBuilder.buildTable(FC);
-
-      // Print the table to the output stream.
-      OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
-                              BitWidth, EncodingIDs);
+  unsigned PrevBitWidth = 0;
+  for (const auto &[Key, EncodingIDs] : EncMap) {
+    auto [BitWidth, DecoderNamespace, HwModeID] = Key;
+
+    // If we are starting a new bitwidth and SpecializeDecodersPerBitwidth is
+    // enabled, emit the decoder function for the previous bitwidth.
+    if (SpecializeDecodersPerBitwidth && PrevBitWidth != BitWidth &&
+        PrevBitWidth != 0) {
+      emitDecoderFunction(OS, TableInfo.Decoders, PrevBitWidth);
+      // Each BitWidth gets its own decoders and decoder function.
+      TableInfo.Decoders.clear();
     }
-
-    unsigned BucketSize = std::get<2>(Bucket.front()->first);
-    const unsigned BucketBitWidth =
-        SpecializeDecodersPerBitwidth ? BucketSize * 8 : 0;
-
-    // Emit the decoder function for each BitWidth bucket.
-    emitDecoderFunction(OS, TableInfo.Decoders, BucketBitWidth);
-  }
+    PrevBitWidth = BitWidth;
+
+    // Emit the decoder for this (namespace, hwmode, width) combination.
+    FilterChooser FC(Encodings, EncodingIDs, BitWidth, Target);
+
+    // The decode table is cleared for each top level decoder function. The
+    // predicates and decoders themselves, however, are shared across
+    // different decoders to give more opportunities for uniqueing.
+    //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
+    //    across all decoder tables for a given bitwidth, else they are shared
+    //    across all decoder tables.
+    //  - predicates are shared across all decoder tables.
+    TableInfo.Table.clear();
+    TableBuilder.buildTable(FC);
+
+    // Print the table to the output stream.
+    OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
+                            BitWidth, EncodingIDs);
+  }
+
+  // Emit the decoder function for the last bucket. This will also emit the
+  // single decoder function if SpecializeDecodersPerBitwidth = false.
+  if (!SpecializeDecodersPerBitwidth)
+    PrevBitWidth = 0;
+  emitDecoderFunction(OS, TableInfo.Decoders, PrevBitWidth);
 
   const bool HasCheckPredicate =
       OpcodeMask &

>From a26edaf0e73169f0ced858b640221f927a9477cc Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 26 Aug 2025 09:02:06 -0700
Subject: [PATCH 3/6] Use nested maps and use cl option

---
 llvm/include/llvm/Target/Target.td            |  7 --
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  1 -
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |  3 +-
 llvm/lib/Target/DirectX/CMakeLists.txt        |  1 +
 llvm/lib/Target/RISCV/CMakeLists.txt          |  3 +-
 llvm/lib/Target/RISCV/RISCV.td                |  1 -
 llvm/test/TableGen/HwModeEncodeDecode3.td     | 14 ++--
 llvm/utils/TableGen/DecoderEmitter.cpp        | 79 ++++++++++---------
 .../lib/Target/AMDGPU/Disassembler/BUILD.gn   |  7 +-
 .../lib/Target/RISCV/Disassembler/BUILD.gn    |  5 +-
 10 files changed, 65 insertions(+), 56 deletions(-)

diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 403c866e386ae..495b59ee916cf 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1158,13 +1158,6 @@ class InstrInfo {
   //
   // This option is a temporary migration help. It will go away.
   bit guessInstructionProperties = true;
-
-  // Generate decoders that are specialized per bit width in the generated
-  // decoder/disassembler. This requires use of different `InsnType` for
-  // different bitwidths and defining `InsnBitWidth` template specialization for
-  // the `InsnType` types used. Some common specializations are already defined
-  // in MCDecoder.h.
-  bit SpecializeDecodersPerBitwidth = false;
 }
 
 // Standard Pseudo Instructions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4d9185d40b9a5..edd3ce72d7df3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2100,7 +2100,6 @@ def FeatureISAVersion12_Generic: FeatureSet<
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
-  let SpecializeDecodersPerBitwidth = true;
 }
 
 def AMDGPUAsmParser : AsmParser {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130ea..56dc6f9e8e6e9 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -6,7 +6,8 @@ tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler
+              --specialize-decoders-per-bitwidth)
 tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt
index 8100f941c8d94..6c079517e22d6 100644
--- a/llvm/lib/Target/DirectX/CMakeLists.txt
+++ b/llvm/lib/Target/DirectX/CMakeLists.txt
@@ -41,6 +41,7 @@ add_llvm_target(DirectXCodeGen
   LINK_COMPONENTS
   Analysis
   AsmPrinter
+  BinaryFormat
   CodeGen
   CodeGenTypes
   Core
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 47329b2c2f4d2..531238ae85029 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -7,7 +7,8 @@ tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter)
 tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred)
 tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler
+              --specialize-decoders-per-bitwidth)
 tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM RISCVGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM RISCVGenMCPseudoLowering.inc -gen-pseudo-lowering)
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index bd8bc56f1da3a..b24d8637cb27f 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -85,7 +85,6 @@ include "RISCVPfmCounters.td"
 
 def RISCVInstrInfo : InstrInfo {
   let guessInstructionProperties = 0;
-  let SpecializeDecodersPerBitwidth = true;
 }
 
 def RISCVAsmParser : AsmParser {
diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td
index dbbf866f057e5..5e9ac7d17e45a 100644
--- a/llvm/test/TableGen/HwModeEncodeDecode3.td
+++ b/llvm/test/TableGen/HwModeEncodeDecode3.td
@@ -118,8 +118,6 @@ def unrelated: Instruction {
 // exact duplicates and could effectively be merged into one.
 // DECODER-LABEL: DecoderTable32
 // DECODER-DAG: Opcode: bar
-// DECODER-LABEL: DecoderTable64
-// DECODER-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-LABEL: DecoderTable_ModeA32
 // DECODER-DAG: Opcode: fooTypeEncA:foo
 // DECODER-DAG: Opcode: bar
@@ -138,13 +136,13 @@ def unrelated: Instruction {
 // DECODER-DAG: Opcode: unrelated
 // DECODER-LABEL: DecoderTableAlt_ModeC32
 // DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTable64
+// DECODER-DAG: Opcode: fooTypeEncDefault:foo
 
 // Under the 'O1' optimization level, unnecessary duplicate tables will be eliminated,
 // reducing the four ‘Alt’ tables down to just one.
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable32
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
-// DECODER-SUPPRESS-O1-LABEL: DecoderTable64
-// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable_ModeA32
 // DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncA:foo
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
@@ -157,6 +155,8 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
 // DECODER-SUPPRESS-O1-LABEL: DecoderTableAlt32
 // DECODER-SUPPRESS-O1-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-O1-LABEL: DecoderTable64
+// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncDefault:foo
 
 // Under the 'O2' optimization condition, instructions possessing the 'EncodingByHwMode'
 // attribute will be extracted from their original DecoderNamespace and placed into their
@@ -166,9 +166,6 @@ def unrelated: Instruction {
 // consider the interplay between HwMode and DecoderNamespace for their instructions.
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable32
 // DECODER-SUPPRESS-O2-DAG: Opcode: bar
-// DECODER-SUPPRESS-O2-LABEL: DecoderTable64
-// DECODER-SUPPRESS-O2-NOT: Opcode: bar
-// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable_ModeA32
 // DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncA:foo
 // DECODER-SUPPRESS-O2-NOT: Opcode: bar
@@ -181,6 +178,9 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O2-NOT: Opcode: bar
 // DECODER-SUPPRESS-O2-LABEL: DecoderTableAlt32
 // DECODER-SUPPRESS-O2-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-O2-LABEL: DecoderTable64
+// DECODER-SUPPRESS-O2-NOT: Opcode: bar
+// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncDefault:foo
 
 // For 'bar' and 'unrelated', we didn't assign any HwModes for them,
 // they should keep the same in the following four tables.
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 31a992611064d..9900f33f76e6e 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -94,6 +94,15 @@ static cl::opt<bool> UseFnTableInDecodeToMCInst(
         "of the generated code."),
     cl::init(false), cl::cat(DisassemblerEmitterCat));
 
+static cl::opt<bool> SpecializeDecodersPerBitwidth(
+    "specialize-decoders-per-bitwidth",
+    cl::desc("Specialize the generated `decodeToMCInst` function per bitwidth. "
+             "Can help reduce the code size. This requires use of different "
+             "`InsnType` for different bitwidths and defining `InsnBitWidth` "
+             "template specialization for the `InsnType` types used. Some "
+             "common specializations are already defined in MCDecoder.h."),
+    cl::init(false), cl::cat(DisassemblerEmitterCat));
+
 STATISTIC(NumEncodings, "Number of encodings considered");
 STATISTIC(NumEncodingsLackingDisasm,
           "Number of encodings without disassembler info");
@@ -2538,8 +2547,13 @@ namespace {
   }
 
   // Map of (bitwidth, namespace, hwmode) tuple to encoding IDs.
-  std::map<std::tuple<unsigned, StringRef, unsigned>, std::vector<unsigned>>
-      EncMap;
+  // It's organized as a nested map, with the (namespace, hwmode) as the key for
+  // the inner map and bitwidth as the key for the outer map. We use std::map
+  // for deterministic iteration order so that the code emitted is also
+  // deterministic.
+  using InnerKeyTy = std::pair<StringRef, unsigned>;
+  using InnerMapTy = std::map<InnerKeyTy, std::vector<unsigned>>;
+  std::map<unsigned, InnerMapTy> EncMap;
 
   for (const auto &[HwModeID, EncodingIDs] : EncodingIDsByHwMode) {
     for (unsigned EncodingID : EncodingIDs) {
@@ -2549,14 +2563,10 @@ namespace {
       const unsigned BitWidth = IsVarLenInst ? MaxInstLen : 8 * Size;
       StringRef DecoderNamespace =
           EncodingDef->getValueAsString("DecoderNamespace");
-      EncMap[{BitWidth, DecoderNamespace, HwModeID}].push_back(EncodingID);
+      EncMap[BitWidth][{DecoderNamespace, HwModeID}].push_back(EncodingID);
     }
   }
 
-  const bool SpecializeDecodersPerBitwidth =
-      Target.getInstructionSet()->getValueAsBit(
-          "SpecializeDecodersPerBitwidth");
-
   // Variable length instructions use the same `APInt` type for all instructions
   // so we cannot specialize decoders based on instruction bitwidths (which
   // requires using different `InstType` for differet bitwidths for the correct
@@ -2571,43 +2581,40 @@ namespace {
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;
 
-  unsigned PrevBitWidth = 0;
-  for (const auto &[Key, EncodingIDs] : EncMap) {
-    auto [BitWidth, DecoderNamespace, HwModeID] = Key;
+  for (const auto &[BitWidth, BWMap] : EncMap) {
+    for (const auto &[Key, EncodingIDs] : BWMap) {
+      auto [DecoderNamespace, HwModeID] = Key;
+
+      // Emit the decoder for this (namespace, hwmode, width) combination.
+      FilterChooser FC(Encodings, EncodingIDs);
+
+      // The decode table is cleared for each top level decoder function. The
+      // predicates and decoders themselves, however, are shared across
+      // different decoders to give more opportunities for uniqueing.
+      //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
+      //    across all decoder tables for a given bitwidth, else they are shared
+      //    across all decoder tables.
+      //  - predicates are shared across all decoder tables.
+      TableInfo.Table.clear();
+      TableBuilder.buildTable(FC);
+
+      // Print the table to the output stream.
+      OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
+                              BitWidth, EncodingIDs);
+    }
 
-    // If we are starting a new bitwidth and SpecializeDecodersPerBitwidth is
-    // enabled, emit the decoder function for the previous bitwidth.
-    if (SpecializeDecodersPerBitwidth && PrevBitWidth != BitWidth &&
-        PrevBitWidth != 0) {
-      emitDecoderFunction(OS, TableInfo.Decoders, PrevBitWidth);
-      // Each BitWidth get's its own decoders and decoder function.
+    // Each BitWidth gets its own decoders and decoder function if
+    // SpecializeDecodersPerBitwidth is enabled.
+    if (SpecializeDecodersPerBitwidth) {
+      emitDecoderFunction(OS, TableInfo.Decoders, BitWidth);
       TableInfo.Decoders.clear();
     }
-    PrevBitWidth = BitWidth;
-
-    // Emit the decoder for this (namespace, hwmode, width) combination.
-    FilterChooser FC(Encodings, EncodingIDs, BitWidth, Target);
-
-    // The decode table is cleared for each top level decoder function. The
-    // predicates and decoders themselves, however, are shared across
-    // different decoders to give more opportunities for uniqueing.
-    //  - If `SpecializeDecodersPerBitwidth` is enabled, decoders are shared
-    //    across all decoder tables for a given bitwidth, else they are shared
-    //    across all decoder tables.
-    //  - predicates are shared across all decoder tables.
-    TableInfo.Table.clear();
-    TableBuilder.buildTable(FC);
-
-    // Print the table to the output stream.
-    OpcodeMask |= emitTable(OS, TableInfo.Table, DecoderNamespace, HwModeID,
-                            BitWidth, EncodingIDs);
   }
 
   // Emit the decoder function for the last bucket. This will also emit the
   // single decoder function if SpecializeDecodersPerBitwidth = false.
   if (!SpecializeDecodersPerBitwidth)
-    PrevBitWidth = 0;
-  emitDecoderFunction(OS, TableInfo.Decoders, PrevBitWidth);
+    emitDecoderFunction(OS, TableInfo.Decoders, 0);
 
   const bool HasCheckPredicate =
       OpcodeMask &
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
index 11bc537936508..9cc98cd8642d6 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Disassembler/BUILD.gn
@@ -2,7 +2,10 @@ import("//llvm/utils/TableGen/tablegen.gni")
 
 tablegen("AMDGPUGenDisassemblerTables") {
   visibility = [ ":Disassembler" ]
-  args = [ "-gen-disassembler" ]
+  args = [
+    "-gen-disassembler",
+    "-specialize-decoders-per-bitwidth",
+  ]
   td_file = "../AMDGPU.td"
 }
 
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
index cb579221fd366..447a67af6be7b 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/Disassembler/BUILD.gn
@@ -2,7 +2,10 @@ import("//llvm/utils/TableGen/tablegen.gni")
 
 tablegen("RISCVGenDisassemblerTables") {
   visibility = [ ":Disassembler" ]
-  args = [ "-gen-disassembler" ]
+  args = [
+    "-gen-disassembler",
+    "-specialize-decoders-per-bitwidth",
+  ]
   td_file = "../RISCV.td"
 }
 

>From 0467f567125b24bf086a67a2f10c20e74a176fb5 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 26 Aug 2025 11:35:42 -0700
Subject: [PATCH 4/6] Review feedback: Make help message succinct

---
 llvm/utils/TableGen/DecoderEmitter.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 9900f33f76e6e..37c8b29e7209f 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -94,13 +94,14 @@ static cl::opt<bool> UseFnTableInDecodeToMCInst(
         "of the generated code."),
     cl::init(false), cl::cat(DisassemblerEmitterCat));
 
+// Enabling this option requires use of different `InsnType` for different
+// bitwidths and defining `InsnBitWidth` template specialization for the
+// `InsnType` types used. Some common specializations are already defined in
+// MCDecoder.h.
 static cl::opt<bool> SpecializeDecodersPerBitwidth(
     "specialize-decoders-per-bitwidth",
     cl::desc("Specialize the generated `decodeToMCInst` function per bitwidth. "
-             "Can help reduce the code size. This requires use of different "
-             "`InsnType` for different bitwidths and defining `InsnBitWidth` "
-             "template specialization for the `InsnType` types used. Some "
-             "common specializations are already defined in MCDecoder.h."),
+             "Helps reduce the code size."),
     cl::init(false), cl::cat(DisassemblerEmitterCat));
 
 STATISTIC(NumEncodings, "Number of encodings considered");
@@ -2576,7 +2577,7 @@ namespace {
         "Cannot specialize decoders for variable length instuctions");
 
   // Entries in `EncMap` are already sorted by bitwidth. So bucketing per
-  // bitwidth can be done on-the-fly as we iterate over tha map.
+  // bitwidth can be done on-the-fly as we iterate over the map.
   DecoderTableInfo TableInfo;
   DecoderTableBuilder TableBuilder(Target, Encodings, TableInfo);
   unsigned OpcodeMask = 0;

>From 8d54b49a044dc04b611e634ff670587d204d4786 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 26 Aug 2025 21:08:19 -0700
Subject: [PATCH 5/6] Use uint64_t for RISCV 48-bit insts

---
 llvm/include/llvm/MC/MCDecoder.h                     | 12 ++----------
 .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp       |  8 ++++++++
 .../Target/RISCV/Disassembler/RISCVDisassembler.cpp  | 11 ++++++++---
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/MC/MCDecoder.h b/llvm/include/llvm/MC/MCDecoder.h
index 6259ef5a3bd5d..f2da8b52251a6 100644
--- a/llvm/include/llvm/MC/MCDecoder.h
+++ b/llvm/include/llvm/MC/MCDecoder.h
@@ -75,18 +75,10 @@ insertBits(IntType &field, IntType bits, unsigned startBit, unsigned numBits) {
 // InsnBitWidth is essentially a type trait used by the decoder emitter to query
 // the supported bitwidth for a given type. But default, the value is 0, making
 // it an invalid type for use as `InsnType` when instantiating the decoder.
+// Individual targets are expected to provide specializations for these based
+// on their usage.
 template <typename T> inline constexpr uint32_t InsnBitWidth = 0;
 
-// Provide specializations for commonly used types.
-// Integer types.
-template <> inline constexpr uint32_t InsnBitWidth<uint8_t> = 8;
-template <> inline constexpr uint32_t InsnBitWidth<uint16_t> = 16;
-template <> inline constexpr uint32_t InsnBitWidth<uint32_t> = 32;
-template <> inline constexpr uint32_t InsnBitWidth<uint64_t> = 64;
-
-// std::bitset<N>.
-template <size_t N> inline constexpr uint32_t InsnBitWidth<std::bitset<N>> = N;
-
 } // namespace llvm::MCD
 
 #endif // LLVM_MC_MCDECODER_H
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ac42e51d447e3..2dc607608b80c 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -447,6 +447,14 @@ static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
 
 #include "AMDGPUGenDisassemblerTables.inc"
 
+// Define bitwidths for various types used to instantiate the decoder.
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 64;
+template <>
+inline constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<96>> = 96;
+template <>
+inline constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<128>> = 128;
+
 //===----------------------------------------------------------------------===//
 //
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index ccec73fb3a455..1447ba364c482 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -700,6 +700,12 @@ static constexpr DecoderListEntry DecoderList32[]{
     {DecoderTableZdinxRV32Only32, {}, "RV32-only Zdinx (Double in Integer)"},
 };
 
+// Define bitwidths for various types used to instantiate the decoder.
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint16_t> = 16;
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+// Use uint64_t to represent 48 bit instructions.
+template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 48;
+
 DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
                                                  ArrayRef<uint8_t> Bytes,
                                                  uint64_t Address,
@@ -790,11 +796,10 @@ DecodeStatus RISCVDisassembler::getInstruction48(MCInst &MI, uint64_t &Size,
   }
   Size = 6;
 
-  uint64_t InsnBits = 0;
+  uint64_t Insn = 0;
   for (size_t i = Size; i-- != 0;)
-    InsnBits += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
+    Insn += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
 
-  std::bitset<48> Insn = InsnBits;
   for (const DecoderListEntry &Entry : DecoderList48) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
       continue;

>From 212e68995e4bbbee6069eeaee525586ceb9d86a5 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 26 Aug 2025 22:10:04 -0700
Subject: [PATCH 6/6] Make InsnBitWidth specializations static

---
 .../lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 8 ++++----
 llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp  | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 2dc607608b80c..80d194afa926b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -448,12 +448,12 @@ static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
 #include "AMDGPUGenDisassemblerTables.inc"
 
 // Define bitwidths for various types used to instantiate the decoder.
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 64;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 64;
 template <>
-inline constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<96>> = 96;
+static constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<96>> = 96;
 template <>
-inline constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<128>> = 128;
+static constexpr uint32_t llvm::MCD::InsnBitWidth<std::bitset<128>> = 128;
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 1447ba364c482..4becd3bbb25f2 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -701,10 +701,10 @@ static constexpr DecoderListEntry DecoderList32[]{
 };
 
 // Define bitwidths for various types used to instantiate the decoder.
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint16_t> = 16;
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint16_t> = 16;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint32_t> = 32;
 // Use uint64_t to represent 48 bit instructions.
-template <> inline constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 48;
+template <> static constexpr uint32_t llvm::MCD::InsnBitWidth<uint64_t> = 48;
 
 DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
                                                  ArrayRef<uint8_t> Bytes,