[llvm] [TableGen][DecoderEmitter] Add option to emit type-specialized code (PR #146593)

Tue Aug 19 02:23:16 PDT 2025

https://github.com/s-barannikov updated https://github.com/llvm/llvm-project/pull/146593

>From 94522ba04995a2ed60b6938b2c9cc4bdd5376622 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi at nvidia.com>
Date: Tue, 1 Jul 2025 09:47:10 -0700
Subject: [PATCH] [TableGen][DecoderEmitter] Add option to emit
 type-specialized `decodeToMCInst`

---
 llvm/include/llvm/Target/Target.td            |   7 +-
 .../Disassembler/AMDGPUDisassembler.cpp       |  31 +-
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h  |  38 --
 llvm/lib/Target/ARC/ARC.td                    |   5 +-
 llvm/lib/Target/AVR/AVR.td                    |   5 +-
 llvm/lib/Target/CSKY/CSKY.td                  |   5 +-
 llvm/lib/Target/MSP430/MSP430.td              |   5 +-
 llvm/lib/Target/Mips/Mips.td                  |   2 +
 llvm/lib/Target/PowerPC/PPC.td                |   2 +
 .../RISCV/Disassembler/RISCVDisassembler.cpp  |  13 +-
 llvm/lib/Target/SystemZ/SystemZ.td            |   6 +-
 llvm/lib/Target/Xtensa/Xtensa.td              |   5 +-
 llvm/test/TableGen/BitOffsetDecoder.td        |  10 +-
 llvm/test/TableGen/DecoderEmitterFnTable.td   |  20 +-
 .../FixedLenDecoderEmitter/InitValue.td       |   6 +-
 llvm/test/TableGen/HwModeEncodeDecode3.td     |  19 +-
 llvm/test/TableGen/VarLenDecoder.td           |  37 +-
 llvm/test/TableGen/trydecode-emission.td      |  26 +-
 llvm/test/TableGen/trydecode-emission2.td     |   8 +-
 llvm/test/TableGen/trydecode-emission4.td     |   4 +-
 llvm/utils/TableGen/DecoderEmitter.cpp        | 561 +++++++++++++-----
 21 files changed, 552 insertions(+), 263 deletions(-)

diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 495b59ee916cf..2fbd9a836d1c0 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1137,7 +1137,6 @@ class OptionalDefOperand<ValueType ty, dag OpTypes, dag defaultops>
   let MIOperandInfo = OpTypes;
 }
 
-
 // InstrInfo - This class should only be instantiated once to provide parameters
 // which are global to the target machine.
 //
@@ -1158,6 +1157,12 @@ class InstrInfo {
   //
   // This option is a temporary migration help. It will go away.
   bit guessInstructionProperties = true;
+
+  // Option to choose between templated and non-templated code from decoder
+  // emitter. When false, the generated `decodeInstruction` function will
+  // use auto-inferred types for the instruction payload instead of generating
+  // templated code using `InsnType` for the instruction payload.
+  bit GenerateTemplatedDecoder = false;
 }
 
 // Standard Pseudo Instructions.
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fb7d634e62272..68f645195ac34 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -35,6 +35,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/Compiler.h"
+#include <bitset>
 
 using namespace llvm;
 
@@ -497,26 +498,24 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
   return Res;
 }
 
-static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<96> eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 12);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Hi(read<uint32_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(4);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
-static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<128> eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 16);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Hi(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
 void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
@@ -599,14 +598,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
     // encodings
     if (isGFX1250() && Bytes.size() >= 16) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
         break;
       Bytes = Bytes_.slice(0, MaxInstBytesNum);
     }
 
-    if (isGFX11Plus() && Bytes.size() >= 12 ) {
-      DecoderUInt128 DecW = eat12Bytes(Bytes);
+    if (isGFX11Plus() && Bytes.size() >= 12) {
+      std::bitset<96> DecW = eat12Bytes(Bytes);
 
       if (isGFX11() &&
           tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
@@ -641,7 +640,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
     } else if (Bytes.size() >= 16 &&
                STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
         break;
 
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index f4d164bf10c3c..ded447b6f8d5a 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -32,44 +32,6 @@ class MCOperand;
 class MCSubtargetInfo;
 class Twine;
 
-// Exposes an interface expected by autogenerated code in
-// FixedLenDecoderEmitter
-class DecoderUInt128 {
-private:
-  uint64_t Lo = 0;
-  uint64_t Hi = 0;
-
-public:
-  DecoderUInt128() = default;
-  DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
-  operator bool() const { return Lo || Hi; }
-  uint64_t extractBitsAsZExtValue(unsigned NumBits,
-                                  unsigned BitPosition) const {
-    assert(NumBits && NumBits <= 64);
-    assert(BitPosition < 128);
-    uint64_t Val;
-    if (BitPosition < 64)
-      Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
-    else
-      Val = Hi >> (BitPosition - 64);
-    return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
-  }
-  DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
-    return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
-  }
-  DecoderUInt128 operator&(const uint64_t &RHS) const {
-    return *this & DecoderUInt128(RHS);
-  }
-  DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
-  bool operator==(const DecoderUInt128 &RHS) {
-    return Lo == RHS.Lo && Hi == RHS.Hi;
-  }
-  bool operator!=(const DecoderUInt128 &RHS) {
-    return Lo != RHS.Lo || Hi != RHS.Hi;
-  }
-  bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); }
-};
-
 //===----------------------------------------------------------------------===//
 // AMDGPUDisassembler
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/ARC/ARC.td b/llvm/lib/Target/ARC/ARC.td
index 142ce7f747919..989dc53794ae6 100644
--- a/llvm/lib/Target/ARC/ARC.td
+++ b/llvm/lib/Target/ARC/ARC.td
@@ -24,7 +24,10 @@ include "ARCRegisterInfo.td"
 include "ARCInstrInfo.td"
 include "ARCCallingConv.td"
 
-def ARCInstrInfo : InstrInfo;
+def ARCInstrInfo : InstrInfo {
+  // FIXME: Migrate ARC disassembler to work with non-templated decoder.
+  let GenerateTemplatedDecoder = true;
+}
 
 class Proc<string Name, list<SubtargetFeature> Features>
  : Processor<Name, NoItineraries, Features>;
diff --git a/llvm/lib/Target/AVR/AVR.td b/llvm/lib/Target/AVR/AVR.td
index 22ffc4a368ad6..dec1925b34035 100644
--- a/llvm/lib/Target/AVR/AVR.td
+++ b/llvm/lib/Target/AVR/AVR.td
@@ -32,7 +32,10 @@ include "AVRRegisterInfo.td"
 
 include "AVRInstrInfo.td"
 
-def AVRInstrInfo : InstrInfo;
+def AVRInstrInfo : InstrInfo {
+  // FIXME: Migrate AVR disassembler to work with non-templated decoder.
+  let GenerateTemplatedDecoder = true;
+}
 
 //===---------------------------------------------------------------------===//
 // Calling Conventions
diff --git a/llvm/lib/Target/CSKY/CSKY.td b/llvm/lib/Target/CSKY/CSKY.td
index b5df93a9d464c..3d3d8dbe8bbfa 100644
--- a/llvm/lib/Target/CSKY/CSKY.td
+++ b/llvm/lib/Target/CSKY/CSKY.td
@@ -671,7 +671,10 @@ def : CK860V<"ck860fv", NoSchedModel,
 // Define the CSKY target.
 //===----------------------------------------------------------------------===//
 
-def CSKYInstrInfo : InstrInfo;
+def CSKYInstrInfo : InstrInfo {
+  // FIXME: Migrate CSKY disassembler to work with non-templated decoder.
+  let GenerateTemplatedDecoder = true;
+}
 
 
 def CSKYAsmParser : AsmParser {
diff --git a/llvm/lib/Target/MSP430/MSP430.td b/llvm/lib/Target/MSP430/MSP430.td
index 38aa30fcf4dd1..d6d569ce33204 100644
--- a/llvm/lib/Target/MSP430/MSP430.td
+++ b/llvm/lib/Target/MSP430/MSP430.td
@@ -61,7 +61,10 @@ include "MSP430CallingConv.td"
 
 include "MSP430InstrInfo.td"
 
-def MSP430InstrInfo : InstrInfo;
+def MSP430InstrInfo : InstrInfo {
+  // FIXME: Migrate MSP430 disassembler to work with non-templated decoder.
+  let GenerateTemplatedDecoder = true;
+}
 
 //===---------------------------------------------------------------------===//
 // Assembly Printers
diff --git a/llvm/lib/Target/Mips/Mips.td b/llvm/lib/Target/Mips/Mips.td
index b346ba95f5984..49bec7e02e4b2 100644
--- a/llvm/lib/Target/Mips/Mips.td
+++ b/llvm/lib/Target/Mips/Mips.td
@@ -229,6 +229,8 @@ include "MipsScheduleP5600.td"
 include "MipsScheduleGeneric.td"
 
 def MipsInstrInfo : InstrInfo {
+  // FIXME: Migrate MIPS disassembler to work with non-templated decoder.
+  let GenerateTemplatedDecoder = true;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index ea7c2203662bd..54b2ff6b3ef05 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -721,6 +721,8 @@ include "PPCCallingConv.td"
 
 def PPCInstrInfo : InstrInfo {
   let isLittleEndianEncoding = 1;
+  // FIXME: Migrate PPC disassembler to work with non-templated decoder.
+  let GenerateTemplatedDecoder = true;
 }
 
 def PPCAsmWriter : AsmWriter {
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 78be55b3a51d3..8c71cd097b8c9 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -712,9 +712,7 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
   }
   Size = 4;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read32le(Bytes.data());
+  uint32_t Insn = support::endian::read32le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList32) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -760,9 +758,7 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
   }
   Size = 2;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read16le(Bytes.data());
+  uint16_t Insn = support::endian::read16le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList16) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -796,9 +792,10 @@ DecodeStatus RISCVDisassembler::getInstruction48(MCInst &MI, uint64_t &Size,
   }
   Size = 6;
 
-  uint64_t Insn = 0;
+  uint64_t InsnBits = 0;
   for (size_t i = Size; i-- != 0;)
-    Insn += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
+    InsnBits += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
+  std::bitset<48> Insn(InsnBits);
 
   for (const DecoderListEntry &Entry : DecoderList48) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
diff --git a/llvm/lib/Target/SystemZ/SystemZ.td b/llvm/lib/Target/SystemZ/SystemZ.td
index ec110645c62dd..9961eee895326 100644
--- a/llvm/lib/Target/SystemZ/SystemZ.td
+++ b/llvm/lib/Target/SystemZ/SystemZ.td
@@ -57,7 +57,11 @@ include "SystemZInstrHFP.td"
 include "SystemZInstrDFP.td"
 include "SystemZInstrSystem.td"
 
-def SystemZInstrInfo : InstrInfo { let guessInstructionProperties = 0; }
+def SystemZInstrInfo : InstrInfo {
+  let guessInstructionProperties = 0;
+  // FIXME: Migrate SystemZ disassembler to work with non-templated decoder.
+  let GenerateTemplatedDecoder = true;
+}
 
 //===----------------------------------------------------------------------===//
 // Assembly parser
diff --git a/llvm/lib/Target/Xtensa/Xtensa.td b/llvm/lib/Target/Xtensa/Xtensa.td
index 4ef885e19101e..d0e7e349bb708 100644
--- a/llvm/lib/Target/Xtensa/Xtensa.td
+++ b/llvm/lib/Target/Xtensa/Xtensa.td
@@ -44,7 +44,10 @@ include "XtensaCallingConv.td"
 
 include "XtensaInstrInfo.td"
 
-def XtensaInstrInfo : InstrInfo;
+def XtensaInstrInfo : InstrInfo {
+  // FIXME: Migrate Xtensa disassembler to work with non-templated decoder.
+  let GenerateTemplatedDecoder = true;
+}
 
 //===----------------------------------------------------------------------===//
 // Target Declaration
diff --git a/llvm/test/TableGen/BitOffsetDecoder.td b/llvm/test/TableGen/BitOffsetDecoder.td
index 04d6e164d0eee..63b874ace85d5 100644
--- a/llvm/test/TableGen/BitOffsetDecoder.td
+++ b/llvm/test/TableGen/BitOffsetDecoder.td
@@ -57,8 +57,8 @@ def baz : Instruction {
 
 }
 
-// CHECK: tmp = fieldFromInstruction(insn, 8, 7);
-// CHECK: tmp = fieldFromInstruction(insn, 8, 8) << 3;
-// CHECK: insertBits(tmp, fieldFromInstruction(insn, 8, 4), 7, 4);
-// CHECK: insertBits(tmp, fieldFromInstruction(insn, 12, 4), 3, 4);
-// CHECK: tmp = fieldFromInstruction(insn, 8, 8) << 4;
+// CHECK: tmp = fieldFromInstruction(Insn, 8, 7);
+// CHECK: tmp = fieldFromInstruction(Insn, 8, 8) << 3;
+// CHECK: insertBits(tmp, fieldFromInstruction(Insn, 8, 4), 7, 4);
+// CHECK: insertBits(tmp, fieldFromInstruction(Insn, 12, 4), 3, 4);
+// CHECK: tmp = fieldFromInstruction(Insn, 8, 8) << 4;
diff --git a/llvm/test/TableGen/DecoderEmitterFnTable.td b/llvm/test/TableGen/DecoderEmitterFnTable.td
index 7bed18c19a513..a08ed66ecbabc 100644
--- a/llvm/test/TableGen/DecoderEmitterFnTable.td
+++ b/llvm/test/TableGen/DecoderEmitterFnTable.td
@@ -71,14 +71,14 @@ def Inst3 : TestInstruction {
   let AsmString = "Inst3";
 }
 
-// CHECK-LABEL: DecodeStatus decodeFn0(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn1(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn2(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: DecodeStatus decodeFn3(DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
-// CHECK-LABEL: decodeToMCInst(unsigned Idx, DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_0(DecodeStatus S, uint8_t Insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_1(DecodeStatus S, uint8_t Insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_2(DecodeStatus S, uint8_t Insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: DecodeStatus decodeFn_3(DecodeStatus S, uint8_t Insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
+// CHECK-LABEL: decodeToMCInst(unsigned Idx, DecodeStatus S, uint8_t Insn, MCInst &MI, uint64_t Address, const MCDisassembler *Decoder, bool &DecodeComplete)
 // CHECK: static constexpr DecodeFnTy decodeFnTable[]
-// CHECK-NEXT: decodeFn0,
-// CHECK-NEXT: decodeFn1,
-// CHECK-NEXT: decodeFn2,
-// CHECK-NEXT: decodeFn3,
-// CHECK: return decodeFnTable[Idx](S, insn, MI, Address, Decoder, DecodeComplete)
+// CHECK-NEXT: decodeFn_0,
+// CHECK-NEXT: decodeFn_1,
+// CHECK-NEXT: decodeFn_2,
+// CHECK-NEXT: decodeFn_3,
+// CHECK: return decodeFnTable[Idx](S, Insn, MI, Address, Decoder, DecodeComplete)
diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/InitValue.td b/llvm/test/TableGen/FixedLenDecoderEmitter/InitValue.td
index 03847439ffc2e..5dc91b0ebedb1 100644
--- a/llvm/test/TableGen/FixedLenDecoderEmitter/InitValue.td
+++ b/llvm/test/TableGen/FixedLenDecoderEmitter/InitValue.td
@@ -39,8 +39,8 @@ def bax : Instruction {
 
 }
 
-// CHECK: tmp = fieldFromInstruction(insn, 9, 7) << 1;
+// CHECK: tmp = fieldFromInstruction(Insn, 9, 7) << 1;
 // CHECK: tmp = 0x1;
-// CHECK: insertBits(tmp, fieldFromInstruction(insn, 9, 7), 1, 7);
+// CHECK: insertBits(tmp, fieldFromInstruction(Insn, 9, 7), 1, 7);
 // CHECK: tmp = 0x100000000;
-// CHECK: insertBits(tmp, fieldFromInstruction(insn, 8, 7), 25, 7);
+// CHECK: insertBits(tmp, fieldFromInstruction(Insn, 8, 7), 25, 7);
diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td
index c4d488d9d5f8f..d9172b0384e8c 100644
--- a/llvm/test/TableGen/HwModeEncodeDecode3.td
+++ b/llvm/test/TableGen/HwModeEncodeDecode3.td
@@ -118,8 +118,6 @@ def unrelated: Instruction {
 // exact duplicates and could effectively be merged into one.
 // DECODER-LABEL: DecoderTable32[] =
 // DECODER-DAG: Opcode: bar
-// DECODER-LABEL: DecoderTable64[] =
-// DECODER-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-LABEL: DecoderTableAlt32[] =
 // DECODER-DAG: Opcode: unrelated
 // DECODER-LABEL: DecoderTableAlt_ModeA32[] =
@@ -138,13 +136,13 @@ def unrelated: Instruction {
 // DECODER-LABEL: DecoderTable_ModeC32[] =
 // DECODER-DAG: Opcode: fooTypeEncC:foo
 // DECODER-DAG: Opcode: bar
+// DECODER-LABEL: DecoderTable64[] =
+// DECODER-DAG: Opcode: fooTypeEncDefault:foo
 
 // Under the 'O1' optimization level, unnecessary duplicate tables will be eliminated,
 // reducing the four ‘Alt’ tables down to just one.
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable32[] =
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
-// DECODER-SUPPRESS-O1-LABEL: DecoderTable64[] =
-// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncDefault:foo
 // DECODER-SUPPRESS-O1-LABEL: DecoderTableAlt32[] =
 // DECODER-SUPPRESS-O1-DAG: Opcode: unrelated
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable_ModeA32[] =
@@ -157,6 +155,8 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable_ModeC32[] =
 // DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncC:foo
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
+// DECODER-SUPPRESS-O1-LABEL: DecoderTable64[] =
+// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncDefault:foo
 
 // Under the 'O2' optimization condition, instructions possessing the 'EncodingByHwMode'
 // attribute will be extracted from their original DecoderNamespace and placed into their
@@ -164,11 +164,13 @@ def unrelated: Instruction {
 // attribute but are within the same DecoderNamespace will be stored in the 'Default' table. This
 // approach will significantly reduce instruction redundancy, but it necessitates users to thoroughly
 // consider the interplay between HwMode and DecoderNamespace for their instructions.
+//
+// Additionally, no 32-bit instruction will appear in a 64-bit decoder table and
+// vice-versa.
+//
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable32[] =
 // DECODER-SUPPRESS-O2-DAG: Opcode: bar
-// DECODER-SUPPRESS-O2-LABEL: DecoderTable64[] =
-// DECODER-SUPPRESS-O2-NOT: Opcode: bar
-// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncDefault:foo
+// DECODER-SUPPRESS-O2-NOT: Opcode: fooTypeEncDefault:foo
 // DECODER-SUPPRESS-O2-LABEL: DecoderTableAlt32[] =
 // DECODER-SUPPRESS-O2-DAG: Opcode: unrelated
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable_ModeA32[] =
@@ -181,6 +183,9 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O2-LABEL: DecoderTable_ModeC32[] =
 // DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncC:foo
 // DECODER-SUPPRESS-O2-NOT: Opcode: bar
+// DECODER-SUPPRESS-O2-LABEL: DecoderTable64[] =
+// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncDefault:foo
+// DECODER-SUPPRESS-O2-NOT: Opcode: bar
 
 // For 'bar' and 'unrelated', we didn't assign any HwModes for them,
 // they should keep the same in the following four tables.
diff --git a/llvm/test/TableGen/VarLenDecoder.td b/llvm/test/TableGen/VarLenDecoder.td
index 06ff62294a196..6b792efc81f8f 100644
--- a/llvm/test/TableGen/VarLenDecoder.td
+++ b/llvm/test/TableGen/VarLenDecoder.td
@@ -3,7 +3,9 @@
 
 include "llvm/Target/Target.td"
 
-def ArchInstrInfo : InstrInfo { }
+def ArchInstrInfo : InstrInfo {
+  let GenerateTemplatedDecoder = false;
+}
 
 def Arch : Target {
   let InstructionSet = ArchInstrInfo;
@@ -47,6 +49,12 @@ def FOO32 : MyVarInst<MemOp32> {
   );
 }
 
+// Instruction length table
+// CHECK-LABEL: InstrLenTable
+// CHECK: 27,
+// CHECK-NEXT: 43,
+// CHECK-NEXT: };
+
 // CHECK-SMALL:      /* 0 */       MCD::OPC_ExtractField, 3, 5,  // Inst{7-3} ...
 // CHECK-SMALL-NEXT: /* 3 */       MCD::OPC_FilterValue, 8, 4, 0, // Skip to: 11
 // CHECK-SMALL-NEXT: /* 7 */       MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 0, // Opcode: FOO16
@@ -61,37 +69,34 @@ def FOO32 : MyVarInst<MemOp32> {
 // CHECK-LARGE-NEXT: /* 14 */      MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: FOO32
 // CHECK-LARGE-NEXT: /* 18 */      MCD::OPC_Fail,
 
-// Instruction length table
-// CHECK: 27,
-// CHECK-NEXT: 43,
-// CHECK-NEXT: };
-
+// CHECK-LABEL: decodeToMCInst
 // CHECK:      case 0:
-// CHECK-NEXT: tmp = fieldFromInstruction(insn, 8, 3);
+// CHECK-NEXT: tmp = fieldFromInstruction(Insn, 8, 3);
 // CHECK-NEXT: if (!Check(S, DecodeRegClassRegisterClass(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; }
-// CHECK-NEXT: tmp = fieldFromInstruction(insn, 0, 3);
+// CHECK-NEXT: tmp = fieldFromInstruction(Insn, 0, 3);
 // CHECK-NEXT: if (!Check(S, DecodeRegClassRegisterClass(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; }
-// CHECK-NEXT: tmp = fieldFromInstruction(insn, 11, 16);
+// CHECK-NEXT: tmp = fieldFromInstruction(Insn, 11, 16);
 // CHECK-NEXT: if (!Check(S, myCustomDecoder(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; }
 // CHECK-NEXT: return S;
 // CHECK-NEXT: case 1:
-// CHECK-NEXT: tmp = fieldFromInstruction(insn, 8, 3);
+// CHECK-NEXT: tmp = fieldFromInstruction(Insn, 8, 3);
 // CHECK-NEXT: if (!Check(S, DecodeRegClassRegisterClass(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; }
-// CHECK-NEXT: tmp = fieldFromInstruction(insn, 0, 3);
+// CHECK-NEXT: tmp = fieldFromInstruction(Insn, 0, 3);
 // CHECK-NEXT: if (!Check(S, myCustomDecoder(MI, tmp, Address, Decoder))) { return MCDisassembler::Fail; }
 // CHECK-NEXT: tmp = 0x0;
-// CHECK-NEXT: insertBits(tmp, fieldFromInstruction(insn, 11, 16), 16, 16);
-// CHECK-NEXT: insertBits(tmp, fieldFromInstruction(insn, 27, 16), 0, 16);
+// CHECK-NEXT: insertBits(tmp, fieldFromInstruction(Insn, 11, 16), 16, 16);
+// CHECK-NEXT: insertBits(tmp, fieldFromInstruction(Insn, 27, 16), 0, 16);
 // CHECK-NEXT: MI.addOperand(MCOperand::createImm(tmp));
 // CHECK-NEXT: return S;
 
+// CHECK-LABEL: decodeInstruction
 // CHECK-LABEL: case MCD::OPC_ExtractField: {
-// CHECK: makeUp(insn, Start + Len);
+// CHECK: makeUp(Insn, Start + Len);
 
 // CHECK-LABEL: case MCD::OPC_CheckField:
 // CHECK-NEXT:  case MCD::OPC_CheckFieldOrFail: {
-// CHECK: makeUp(insn, Start + Len);
+// CHECK: makeUp(Insn, Start + Len);
 
 // CHECK-LABEL: case MCD::OPC_Decode: {
 // CHECK: Len = InstrLenTable[Opc];
-// CHECK-NEXT: makeUp(insn, Len);
+// CHECK-NEXT: makeUp(Insn, Len);
diff --git a/llvm/test/TableGen/trydecode-emission.td b/llvm/test/TableGen/trydecode-emission.td
index c3178dd71cf4b..48004751d24e3 100644
--- a/llvm/test/TableGen/trydecode-emission.td
+++ b/llvm/test/TableGen/trydecode-emission.td
@@ -8,7 +8,7 @@
 
 include "llvm/Target/Target.td"
 
-def archInstrInfo : InstrInfo { }
+def archInstrInfo : InstrInfo;
 
 def arch : Target {
   let InstructionSet = archInstrInfo;
@@ -34,6 +34,17 @@ def InstB : TestInstruction {
   let hasCompleteDecoder = 0;
 }
 
+// CHECK-LABEL: decodeNumToSkip
+// CHECK-NEXT:  unsigned NumToSkip = *Ptr++;
+// CHECK-NEXT:  NumToSkip |= (*Ptr++) << 8;
+// CHECK-NEXT:  return NumToSkip;
+
+// CHECK-LARGE-LABEL: decodeNumToSkip
+// CHECK-LARGE-NEXT:  unsigned NumToSkip = *Ptr++;
+// CHECK-LARGE-NEXT:  NumToSkip |= (*Ptr++) << 8;
+// CHECK-LARGE-NEXT:  NumToSkip |= (*Ptr++) << 16;
+// CHECK-LARGE-NEXT:  return NumToSkip;
+
 // CHECK:      /* 0 */       MCD::OPC_ExtractField, 4, 4,  // Inst{7-4} ...
 // CHECK-NEXT: /* 3 */       MCD::OPC_FilterValueOrFail, 0,
 // CHECK-NEXT: /* 5 */       MCD::OPC_CheckField, 2, 2, 0, 6, 0, // Skip to: 17
@@ -41,11 +52,7 @@ def InstB : TestInstruction {
 // CHECK-NEXT: /* 17 */      MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: {{[0-9]+}}
 // CHECK-NEXT: /* 21 */      MCD::OPC_Fail,
 
-// CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
-
-// CHECK:       unsigned NumToSkip = *Ptr++;
-// CHECK-NEXT:  NumToSkip |= (*Ptr++) << 8;
-// CHECK-NEXT:  return NumToSkip;
+// CHECK: if (!Check(S, DecodeInstB(MI, Insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
 
 // CHECK-LARGE:      /* 0 */       MCD::OPC_ExtractField, 4, 4,  // Inst{7-4} ...
 // CHECK-LARGE-NEXT: /* 3 */       MCD::OPC_FilterValueOrFail, 0,
@@ -54,9 +61,4 @@ def InstB : TestInstruction {
 // CHECK-LARGE-NEXT: /* 19 */      MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: {{[0-9]+}}
 // CHECK-LARGE-NEXT: /* 23 */      MCD::OPC_Fail,
 
-// CHECK-LARGE: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
-
-// CHECK-LARGE:       unsigned NumToSkip = *Ptr++;
-// CHECK-LARGE-NEXT:  NumToSkip |= (*Ptr++) << 8;
-// CHECK-LARGE-NEXT:  NumToSkip |= (*Ptr++) << 16;
-// CHECK-LARGE-NEXT:  return NumToSkip;
+// CHECK-LARGE: if (!Check(S, DecodeInstB(MI, Insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
diff --git a/llvm/test/TableGen/trydecode-emission2.td b/llvm/test/TableGen/trydecode-emission2.td
index 4c8a95eff5dd1..7ad91cb23fffe 100644
--- a/llvm/test/TableGen/trydecode-emission2.td
+++ b/llvm/test/TableGen/trydecode-emission2.td
@@ -41,8 +41,8 @@ def InstB : TestInstruction {
 // CHECK-NEXT: /* 26 */      MCD::OPC_TryDecodeOrFail, {{[0-9]+}}, {{[0-9]+}}, 1,
 // CHECK-NEXT: /* 30 */      MCD::OPC_Fail,
 
-// CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
-// CHECK: if (!Check(S, DecodeInstA(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
+// CHECK: if (!Check(S, DecodeInstB(MI, Insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
+// CHECK: if (!Check(S, DecodeInstA(MI, Insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
 
 // CHECK-LARGE:      /* 0 */       MCD::OPC_ExtractField, 2, 1,  // Inst{2} ...
 // CHECK-LARGE-NEXT: /* 3 */       MCD::OPC_FilterValueOrFail, 0,
@@ -54,5 +54,5 @@ def InstB : TestInstruction {
 // CHECK-LARGE-NEXT: /* 28 */      MCD::OPC_TryDecodeOrFail, {{[0-9]+}}, {{[0-9]+}}, 1,
 // CHECK-LARGE-NEXT: /* 32 */      MCD::OPC_Fail,
 
-// CHECK-LARGE: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
-// CHECK-LARGE: if (!Check(S, DecodeInstA(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
+// CHECK-LARGE: if (!Check(S, DecodeInstB(MI, Insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
+// CHECK-LARGE: if (!Check(S, DecodeInstA(MI, Insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
diff --git a/llvm/test/TableGen/trydecode-emission4.td b/llvm/test/TableGen/trydecode-emission4.td
index 2c63229c053a5..bdf87c6359e8e 100644
--- a/llvm/test/TableGen/trydecode-emission4.td
+++ b/llvm/test/TableGen/trydecode-emission4.td
@@ -40,7 +40,7 @@ def InstB : TestInstruction {
 // CHECK-NEXT: /* 19 */      MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: {{[0-9]+}}
 // CHECK-NEXT: /* 23 */      MCD::OPC_Fail,
 
-// CHECK: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
+// CHECK: if (!Check(S, DecodeInstB(MI, Insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
 
 
 // CHECK-LARGE:      /* 0 */       MCD::OPC_ExtractField, 250, 3, 4,  // Inst{509-506} ...
@@ -50,5 +50,5 @@ def InstB : TestInstruction {
 // CHECK-LARGE-NEXT: /* 21 */      MCD::OPC_Decode, {{[0-9]+}}, {{[0-9]+}}, 1, // Opcode: InstA, DecodeIdx: {{[0-9]+}}
 // CHECK-LARGE-NEXT: /* 25 */      MCD::OPC_Fail,
 
-// CHECK-LARGE: if (!Check(S, DecodeInstB(MI, insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
+// CHECK-LARGE: if (!Check(S, DecodeInstB(MI, Insn, Address, Decoder))) { DecodeComplete = false; return MCDisassembler::Fail; }
 
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 2b44577253982..9591b874f71ce 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
@@ -217,6 +218,20 @@ struct EncodingAndInst {
 
 using NamespacesHwModesMap = std::map<std::string, std::set<unsigned>>;
 
+// A struct to represent the C++ type for the instruction payload.
+struct CPPType {
+  enum TypeKind { TemplateTy, UIntTy, APIntTy, BitsetTy } Kind;
+  unsigned Bitwidth; // 0 for TemplateTy.
+
+  CPPType(unsigned Bitwidth, bool IsVarLenInst);
+
+  // Returns the C++ type name for code generation.
+  std::string getName() const;
+
+  // Returns the parameter declaration for code generation.
+  std::string getParamDecl() const;
+};
+
 class DecoderEmitter {
   const RecordKeeper &RK;
   std::vector<EncodingAndInst> NumberedEncodings;
@@ -236,8 +251,8 @@ class DecoderEmitter {
                          ArrayRef<unsigned> InstrLen) const;
   void emitPredicateFunction(formatted_raw_ostream &OS,
                              PredicateSet &Predicates) const;
-  void emitDecoderFunction(formatted_raw_ostream &OS,
-                           DecoderSet &Decoders) const;
+  void emitDecoderFunction(formatted_raw_ostream &OS, DecoderSet &Decoders,
+                           const CPPType &Type, StringRef Suffix) const;
 
   // run - Output the code emitter
   void run(raw_ostream &o);
@@ -1017,8 +1032,8 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS,
                                            PredicateSet &Predicates) const {
   // The predicate function is just a big switch statement based on the
   // input predicate index.
-  OS << "static bool checkDecoderPredicate(unsigned Idx, const FeatureBitset "
-        "&Bits) {\n";
+  OS << "static bool checkDecoderPredicate(unsigned Idx, "
+     << "const FeatureBitset &Bits) {\n";
   OS << "  switch (Idx) {\n";
   OS << "  default: llvm_unreachable(\"Invalid index!\");\n";
   for (const auto &[Index, Predicate] : enumerate(Predicates)) {
@@ -1029,38 +1044,109 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS,
   OS << "}\n\n";
 }
 
+// ----------------------------------------------------------------------------
+// CPPType implementation.
+
+CPPType::CPPType(unsigned Bitwidth, bool IsVarLenInst) : Bitwidth(Bitwidth) {
+  if (IsVarLenInst)
+    Kind = APIntTy;
+  else if (Bitwidth == 0)
+    Kind = TemplateTy;
+  else if (Bitwidth == 8 || Bitwidth == 16 || Bitwidth == 32 || Bitwidth == 64)
+    Kind = UIntTy;
+  else
+    Kind = BitsetTy;
+}
+
+std::string CPPType::getName() const {
+  switch (Kind) {
+  case TemplateTy:
+    return "InsnType";
+  case UIntTy:
+    return "uint" + std::to_string(Bitwidth) + "_t";
+  case APIntTy:
+    return "APInt";
+  case BitsetTy:
+    return "std::bitset<" + std::to_string(Bitwidth) + ">";
+  }
+  llvm_unreachable("Unexpected kind");
+}
+
+std::string CPPType::getParamDecl() const {
+  switch (Kind) {
+  case TemplateTy:
+  case BitsetTy:
+    return "const " + getName() + " &Insn";
+  case UIntTy:
+    return getName() + " Insn";
+  case APIntTy:
+    return "APInt &Insn";
+  }
+  llvm_unreachable("Unexpected kind");
+}
+
+static void emitTemplate(formatted_raw_ostream &OS, const CPPType &Type) {
+  if (Type.Kind == CPPType::TemplateTy)
+    OS << "template <typename InsnType>\n";
+}
+
 void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
-                                         DecoderSet &Decoders) const {
+                                         DecoderSet &Decoders,
+                                         const CPPType &Type,
+                                         StringRef Suffix) const {
   // The decoder function is just a big switch statement or a table of function
   // pointers based on the input decoder index.
+  const std::string TypeName = Type.getName();
+  const std::string TypeParamDecl = Type.getParamDecl();
 
   // TODO: When InsnType is large, using uint64_t limits all fields to 64 bits
   // It would be better for emitBinaryParser to use a 64-bit tmp whenever
   // possible but fall back to an InsnType-sized tmp for truly large fields.
-  StringRef TmpTypeDecl =
-      "using TmpType = std::conditional_t<std::is_integral<InsnType>::value, "
-      "InsnType, uint64_t>;\n";
-  StringRef DecodeParams =
-      "DecodeStatus S, InsnType insn, MCInst &MI, uint64_t Address, const "
-      "MCDisassembler *Decoder, bool &DecodeComplete";
+  auto emitTmpTypeDec = [&Type, &TypeName, &OS]() {
+    if (Type.Kind == CPPType::TemplateTy)
+      OS << formatv(
+          "  using TmpType = std::conditional_t<std::is_integral<{0}>::value, "
+          "{0}, uint64_t>;\n",
+          TypeName);
+  };
+
+  // Returns the type to use for the `tmp` variable.
+  StringRef TmpType = [&Type, &TypeName]() -> StringRef {
+    switch (Type.Kind) {
+    case CPPType::TemplateTy:
+      return "TmpType";
+    case CPPType::UIntTy:
+      return TypeName;
+    default:
+      return "uint64_t";
+    }
+  }();
+
+  auto DecodeParams =
+      formatv("DecodeStatus S, {}, MCInst &MI, uint64_t Address, "
+              "const MCDisassembler *Decoder, bool &DecodeComplete",
+              TypeParamDecl);
 
   if (UseFnTableInDecodeToMCInst) {
     // Emit a function for each case first.
     for (const auto &[Index, Decoder] : enumerate(Decoders)) {
-      OS << "template <typename InsnType>\n";
-      OS << "DecodeStatus decodeFn" << Index << "(" << DecodeParams << ") {\n";
-      OS << "  " << TmpTypeDecl;
-      OS << "  [[maybe_unused]] TmpType tmp;\n";
+      emitTemplate(OS, Type);
+      OS << "DecodeStatus decodeFn" << Suffix << '_' << Index << "("
+         << DecodeParams << ") {\n";
+      emitTmpTypeDec();
+      OS << "  [[maybe_unused]] " << TmpType << " tmp;\n";
       OS << Decoder;
       OS << "  return S;\n";
       OS << "}\n\n";
     }
   }
 
+  assert(!Decoders.empty() && "Did not find any decoders");
+
   OS << "// Handling " << Decoders.size() << " cases.\n";
-  OS << "template <typename InsnType>\n";
-  OS << "static DecodeStatus decodeToMCInst(unsigned Idx, " << DecodeParams
-     << ") {\n";
+  emitTemplate(OS, Type);
+  OS << "static DecodeStatus decodeToMCInst" << Suffix << "(unsigned Idx, "
+     << DecodeParams << ") {\n";
   OS << "  DecodeComplete = true;\n";
 
   if (UseFnTableInDecodeToMCInst) {
@@ -1068,15 +1154,15 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS,
     OS << "  using DecodeFnTy = DecodeStatus (*)(" << DecodeParams << ");\n";
     OS << "  static constexpr DecodeFnTy decodeFnTable[] = {\n";
     for (size_t Index : llvm::seq(Decoders.size()))
-      OS << "    decodeFn" << Index << ",\n";
+      OS << "    decodeFn" << Suffix << '_' << Index << ",\n";
     OS << "  };\n";
     OS << "  if (Idx >= " << Decoders.size() << ")\n";
     OS << "    llvm_unreachable(\"Invalid index!\");\n";
-    OS << "  return decodeFnTable[Idx](S, insn, MI, Address, Decoder, "
+    OS << "  return decodeFnTable[Idx](S, Insn, MI, Address, Decoder, "
           "DecodeComplete);\n";
   } else {
-    OS << "  " << TmpTypeDecl;
-    OS << "  TmpType tmp;\n";
+    emitTmpTypeDec();
+    OS << "  " << TmpType << " tmp;\n";
     OS << "  switch (Idx) {\n";
     OS << "  default: llvm_unreachable(\"Invalid index!\");\n";
     for (const auto &[Index, Decoder] : enumerate(Decoders)) {
@@ -1174,7 +1260,7 @@ bool FilterChooser::emitBinaryParser(raw_ostream &OS, indent Indent,
       OS << "insertBits(tmp, ";
     else
       OS << "tmp = ";
-    OS << "fieldFromInstruction(insn, " << EF.Base << ", " << EF.Width << ')';
+    OS << "fieldFromInstruction(Insn, " << EF.Base << ", " << EF.Width << ')';
     if (UseInsertBits)
       OS << ", " << EF.Offset << ", " << EF.Width << ')';
     else if (EF.Offset != 0)
@@ -1205,7 +1291,7 @@ bool FilterChooser::emitDecoder(raw_ostream &OS, indent Indent,
     if (Op.numFields() == 0 && !Op.Decoder.empty()) {
       HasCompleteDecoder = Op.HasCompleteDecoder;
       OS << Indent << "if (!Check(S, " << Op.Decoder
-         << "(MI, insn, Address, Decoder))) { "
+         << "(MI, Insn, Address, Decoder))) { "
          << (HasCompleteDecoder ? "" : "DecodeComplete = false; ")
          << "return MCDisassembler::Fail; }\n";
       break;
@@ -2086,69 +2172,161 @@ populateInstruction(const CodeGenTarget &Target, const Record &EncodingDef,
   return Bits.getNumBits();
 }
 
-// emitFieldFromInstruction - Emit the templated helper function
-// fieldFromInstruction().
-// On Windows we make sure that this function is not inlined when
-// using the VS compiler. It has a bug which causes the function
-// to be optimized out in some circumstances. See llvm.org/pr38292
-static void emitFieldFromInstruction(formatted_raw_ostream &OS) {
-  OS << R"(
-// Helper functions for extracting fields from encoded instructions.
-// InsnType must either be integral or an APInt-like object that must:
-// * be default-constructible and copy-constructible
-// * Support extractBitsAsZExtValue(numBits, startBit)
-// * Support the ~, &, ==, and != operators with other objects of the same type
-// * Support the != and bitwise & with uint64_t
-template <typename InsnType>
+// Emit the templated helper function fieldFromInstruction().
+//
+// On Windows we make sure that this function is not inlined when using the VS
+// compiler. It has a bug which causes the function to be optimized out in some
+// circumstances. See llvm.org/pr38292
+//
+// There are 4 variants of this function that can be generated under different
+// conditions:
+//
+// 1. Integer types (for non-templated code when using integer types and when
+//                   generating templated code).
+// 2. bitset types (for non-templated code with bitset type).
+// 3. APInt type (for variable length instructions).
+// 4. Non-Integer `InsnType` (when generating templated code)
+
+static void emitFieldFromInstruction(formatted_raw_ostream &OS,
+                                     bool GenerateIntType,
+                                     bool GenerateBitsetType,
+                                     bool GenerateAPIntType,
+                                     bool GenerateTemplateType) {
+  if (GenerateIntType) {
+    OS << R"(
+// Helper macro to disable inlining of `fieldFromInstruction` for integer types.
 #if defined(_MSC_VER) && !defined(__clang__)
-__declspec(noinline)
+#define DEC_EMIT_NO_INLINE __declspec(noinline)
+#else
+#define DEC_EMIT_NO_INLINE
 #endif
-static std::enable_if_t<std::is_integral<InsnType>::value, InsnType>
-fieldFromInstruction(const InsnType &insn, unsigned startBit,
-                     unsigned numBits) {
-  assert(startBit + numBits <= 64 && "Cannot support >64-bit extractions!");
-  assert(startBit + numBits <= (sizeof(InsnType) * 8) &&
+
+template <typename IntType>
+DEC_EMIT_NO_INLINE static 
+std::enable_if_t<std::is_integral_v<IntType>, IntType>
+fieldFromInstruction(const IntType &Insn, unsigned StartBit, unsigned Size) {
+  assert(StartBit + Size <= 64 && "Cannot support >64-bit extractions!");
+  assert(StartBit + Size <= (sizeof(IntType) * 8) &&
          "Instruction field out of bounds!");
-  InsnType fieldMask;
-  if (numBits == sizeof(InsnType) * 8)
-    fieldMask = (InsnType)(-1LL);
+  IntType fieldMask;
+  if (Size == sizeof(IntType) * 8)
+    fieldMask = (IntType)(-1LL);
   else
-    fieldMask = (((InsnType)1 << numBits) - 1) << startBit;
-  return (insn & fieldMask) >> startBit;
+    fieldMask = (((IntType)1 << Size) - 1) << StartBit;
+  return (Insn & fieldMask) >> StartBit;
+}
+#undef DEC_EMIT_NO_INLINE
+
+)";
+  }
+
+  if (GenerateBitsetType) {
+    // Emit a version that will work with a std::bitset.
+    OS << R"(
+template <size_t N>
+uint64_t fieldFromInstruction(const std::bitset<N>& Insn, unsigned StartBit,
+                              unsigned NumBits) {
+  assert(StartBit + NumBits <= N && "Instruction field out of bounds!");
+  assert(NumBits <= 64 && "Cannot support >64-bit extractions!");
+  const std::bitset<N> Mask(maskTrailingOnes<uint64_t>(NumBits));
+  return ((Insn >> StartBit) & Mask).to_ullong();
+}
+)";
+  }
+
+  if (GenerateAPIntType) {
+    OS << R"(
+static uint64_t fieldFromInstruction(const APInt &Insn, unsigned StartBit,
+                     unsigned NumBits) {
+  return Insn.extractBitsAsZExtValue(NumBits, StartBit);
 }
+)";
+  }
+
+  if (GenerateTemplateType) {
+    OS << R"(
+// Helper functions for extracting fields from encoded instructions.
+// InsnType must either be integral or an APInt-like object that must:
+// * be default-constructible and copy-constructible
+// * Support extractBitsAsZExtValue(NumBits, StartBit)
+// * Support the ~, &, ==, and != operators with other objects of the same type
+// * Support the != and bitwise & with uint64_t
 
 template <typename InsnType>
-static std::enable_if_t<!std::is_integral<InsnType>::value, uint64_t>
-fieldFromInstruction(const InsnType &insn, unsigned startBit,
-                     unsigned numBits) {
-  return insn.extractBitsAsZExtValue(numBits, startBit);
+static std::enable_if_t<!std::is_integral_v<InsnType>, uint64_t>
+fieldFromInstruction(const InsnType &Insn, unsigned StartBit,
+                     unsigned NumBits) {
+  return Insn.extractBitsAsZExtValue(NumBits, StartBit);
 }
 )";
+  }
 }
 
-// emitInsertBits - Emit the templated helper function insertBits().
+// Emit the helper function insertBits().
 static void emitInsertBits(formatted_raw_ostream &OS) {
   OS << R"(
 // Helper function for inserting bits extracted from an encoded instruction into
 // an integer-typed field.
 template <typename IntType>
 static std::enable_if_t<std::is_integral_v<IntType>, void>
-insertBits(IntType &field, IntType bits, unsigned startBit, unsigned numBits) {
-  // Check that no bit beyond numBits is set, so that a simple bitwise |
+insertBits(IntType &Field, IntType Bits, unsigned StartBit, unsigned NumBits) {
+  // Check that no bit beyond NumBits is set, so that a simple bitwise |
   // is sufficient.
-  assert((~(((IntType)1 << numBits) - 1) & bits) == 0 &&
-           "bits has more than numBits bits set");
-  assert(startBit + numBits <= sizeof(IntType) * 8);
-  (void)numBits;
-  field |= bits << startBit;
+  assert((~(((IntType)1 << NumBits) - 1) & Bits) == 0 &&
+           "Bits has more than NumBits bits set");
+  assert(StartBit + NumBits <= sizeof(IntType) * 8);
+  (void)NumBits;
+  Field |= Bits << StartBit;
 }
 )";
 }
 
-// emitDecodeInstruction - Emit the templated helper function
-// decodeInstruction().
+static void emitDecodeInstructionAsCallToImpl(formatted_raw_ostream &OS,
+                                              const CPPType &Type,
+                                              const CPPType &MaxType,
+                                              StringRef Suffix) {
+  OS << formatv(R"(
+static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
+    {}, uint64_t Address, const MCDisassembler *DisAsm,
+    const MCSubtargetInfo &STI) {{
+)",
+                Type.getParamDecl());
+
+  // Convert `Insn` to max bitwidth type.
+  bool UseAssignment = MaxType.Kind == CPPType::UIntTy ||
+                       Type.Kind == CPPType::UIntTy ||
+                       Type.Bitwidth == MaxType.Bitwidth;
+  if (UseAssignment) {
+    OS << formatv("  {} InsnMaxWidth = Insn;", MaxType.getName());
+  } else {
+    // Expand from smaller bitset type to larger bitset type.
+    OS << formatv("  const {} Mask(maskTrailingOnes<uint64_t>(64));\n",
+                  Type.getName());
+    OS << formatv("  {} InsnMaxWidth((Insn & Mask).to_ullong());\n",
+                  MaxType.getName());
+    for (unsigned I = 64; I < Type.Bitwidth; I += 64)
+      OS << formatv(
+          "  InsnMaxWidth |= {0}(((Insn >> {1}) & Mask).to_ullong()) << {1};\n",
+          MaxType.getName(), I);
+  }
+
+  OS << formatv(R"(
+  auto DecodeToMCInst = [{}Insn](unsigned DecodeIdx, DecodeStatus S, MCInst &MI,
+                                 uint64_t Address, const MCDisassembler *DisAsm,
+                                 bool &DecodeComplete) {{
+    return decodeToMCInst{}(DecodeIdx, S, Insn, MI, Address, DisAsm, DecodeComplete);
+  };
+  return decodeInstructionImpl(DecodeTable, MI, InsnMaxWidth, Address, DisAsm, STI, DecodeToMCInst);
+}
+
+)",
+                Type.Kind == CPPType::UIntTy ? "" : "&", Suffix);
+}
+
+// Emit the entry function decodeInstruction().
 static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst,
-                                  unsigned OpcodeMask) {
+                                  unsigned OpcodeMask, const CPPType &Type,
+                                  StringRef Suffix, bool IsImplFunction) {
   const bool HasTryDecode = OpcodeMask & ((1 << MCD::OPC_TryDecode) |
                                           (1 << MCD::OPC_TryDecodeOrFail));
   const bool HasCheckPredicate =
@@ -2156,29 +2334,34 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst,
       ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail));
   const bool HasSoftFail = OpcodeMask & (1 << MCD::OPC_SoftFail);
 
-  OS << R"(
-static unsigned decodeNumToSkip(const uint8_t *&Ptr) {
-  unsigned NumToSkip = *Ptr++;
-  NumToSkip |= (*Ptr++) << 8;
-)";
-  if (getNumToSkipInBytes() == 3)
-    OS << "  NumToSkip |= (*Ptr++) << 16;\n";
-  OS << R"(  return NumToSkip;
-}
+  assert(
+      (!IsVarLenInst || (Type.Kind == CPPType::APIntTy && !IsImplFunction)) &&
+      "For variable length instructions, expected use of APInt and no impl "
+      "function");
 
-template <typename InsnType>
-static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
-                                      InsnType insn, uint64_t Address,
-                                      const MCDisassembler *DisAsm,
-                                      const MCSubtargetInfo &STI)";
+  emitTemplate(OS, Type);
+  OS << formatv("static DecodeStatus decodeInstruction{}(const uint8_t "
+                "DecodeTable[], MCInst &MI, {}, uint64_t Address, "
+                "const MCDisassembler *DisAsm, const MCSubtargetInfo &STI",
+                IsImplFunction ? "Impl" : "", Type.getParamDecl());
   if (IsVarLenInst) {
-    OS << ",\n                                      "
-          "llvm::function_ref<void(APInt &, uint64_t)> makeUp";
+    OS << ", function_ref<void(APInt &, uint64_t)> makeUp";
+  } else if (IsImplFunction) {
+    OS << ", function_ref<DecodeStatus(unsigned, DecodeStatus, MCInst &, "
+          "uint64_t, const MCDisassembler *, bool &)> decodeToMCInstPtr";
   }
+
   OS << ") {\n";
+
   if (HasCheckPredicate)
     OS << "  const FeatureBitset &Bits = STI.getFeatureBits();\n";
 
+  std::string DecodeToMCInstDirectCall =
+      "decodeToMCInst" + Suffix.str() + "(DecodeIdx, S, Insn";
+  StringRef DecodeToMCInstCall = DecodeToMCInstDirectCall;
+  if (IsImplFunction)
+    DecodeToMCInstCall = "decodeToMCInstPtr(DecodeIdx, S";
+
   OS << R"(
   const uint8_t *Ptr = DecodeTable;
   uint64_t CurFieldValue = 0;
@@ -2196,9 +2379,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       unsigned Start = decodeULEB128AndIncUnsafe(Ptr);
       unsigned Len = *Ptr++;)";
   if (IsVarLenInst)
-    OS << "\n      makeUp(insn, Start + Len);";
+    OS << "\n      makeUp(Insn, Start + Len);";
   OS << R"(
-      CurFieldValue = fieldFromInstruction(insn, Start, Len);
+      CurFieldValue = fieldFromInstruction(Insn, Start, Len);
       LLVM_DEBUG(dbgs() << Loc << ": OPC_ExtractField(" << Start << ", "
                    << Len << "): " << CurFieldValue << "\n");
       break;
@@ -2235,9 +2418,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       unsigned Start = decodeULEB128AndIncUnsafe(Ptr);
       unsigned Len = *Ptr;)";
   if (IsVarLenInst)
-    OS << "\n      makeUp(insn, Start + Len);";
+    OS << "\n      makeUp(Insn, Start + Len);";
   OS << R"(
-      uint64_t FieldValue = fieldFromInstruction(insn, Start, Len);
+      uint64_t FieldValue = fieldFromInstruction(Insn, Start, Len);
       // Decode the field value.
       unsigned PtrLen = 0;
       uint64_t ExpectedValue = decodeULEB128(++Ptr, &PtrLen);
@@ -2297,21 +2480,23 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       bool DecodeComplete;)";
   if (IsVarLenInst) {
     OS << "\n      unsigned Len = InstrLenTable[Opc];\n"
-       << "      makeUp(insn, Len);";
+       << "      makeUp(Insn, Len);";
   }
-  OS << R"(
-      S = decodeToMCInst(DecodeIdx, S, insn, MI, Address, DisAsm, DecodeComplete);
+
+  OS << formatv(R"(
+      S = {}, MI, Address, DisAsm, DecodeComplete);
       assert(DecodeComplete);
 
       LLVM_DEBUG(dbgs() << Loc << ": OPC_Decode: opcode " << Opc
                    << ", using decoder " << DecodeIdx << ": "
                    << (S != MCDisassembler::Fail ? "PASS\n" : "FAIL\n"));
       return S;
-    })";
+    })",
+                DecodeToMCInstCall);
   if (HasTryDecode) {
-    OS << R"(
+    OS << formatv(R"(
     case MCD::OPC_TryDecode:
-    case MCD::OPC_TryDecodeOrFail: {
+    case MCD::OPC_TryDecodeOrFail: {{
       bool IsFail = DecoderOp == MCD::OPC_TryDecodeOrFail;
       // Decode the Opcode value.
       unsigned Opc = decodeULEB128AndIncUnsafe(Ptr);
@@ -2322,18 +2507,18 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       MCInst TmpMI;
       TmpMI.setOpcode(Opc);
       bool DecodeComplete;
-      S = decodeToMCInst(DecodeIdx, S, insn, TmpMI, Address, DisAsm, DecodeComplete);
+      S = {}, TmpMI, Address, DisAsm, DecodeComplete);
       LLVM_DEBUG(dbgs() << Loc << ": OPC_TryDecode: opcode " << Opc
                    << ", using decoder " << DecodeIdx << ": ");
 
-      if (DecodeComplete) {
+      if (DecodeComplete) {{
         // Decoding complete.
         LLVM_DEBUG(dbgs() << (S != MCDisassembler::Fail ? "PASS\n" : "FAIL\n"));
         MI = TmpMI;
         return S;
       }
       assert(S == MCDisassembler::Fail);
-      if (IsFail) {
+      if (IsFail) {{
         LLVM_DEBUG(dbgs() << "FAIL: returning FAIL\n");
         return MCDisassembler::Fail;
       }
@@ -2344,7 +2529,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       // set before the decode attempt.
       S = MCDisassembler::Success;
       break;
-    })";
+    })",
+                  DecodeToMCInstCall);
   }
   if (HasSoftFail) {
     OS << R"(
@@ -2352,7 +2538,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
         // Decode the mask values.
         uint64_t PositiveMask = decodeULEB128AndIncUnsafe(Ptr);
         uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr);
-        bool Failed = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
+        bool Failed = (Insn & PositiveMask) != 0 || (~Insn & NegativeMask) != 0;
         if (Failed)
           S = MCDisassembler::SoftFail;
         LLVM_DEBUG(dbgs() << Loc << ": OPC_SoftFail: " << (Failed ? "FAIL\n" : "PASS\n"));
@@ -2372,16 +2558,31 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
 )";
 }
 
+static void emitCommonFunctions(formatted_raw_ostream &OS) {
+  OS << R"(
 // Helper to propagate SoftFail status. Returns false if the status is Fail;
 // callers are expected to early-exit in that condition. (Note, the '&' operator
 // is correct to propagate the values of this enum; see comment on 'enum
 // DecodeStatus'.)
-static void emitCheck(formatted_raw_ostream &OS) {
-  OS << R"(
 static bool Check(DecodeStatus &Out, DecodeStatus In) {
   Out = static_cast<DecodeStatus>(Out & In);
   return Out != MCDisassembler::Fail;
 }
+)";
+
+  OS << R"(
+// Helper to decode the `NumToSkip` value encoded in the decoder table.
+static unsigned decodeNumToSkip(const uint8_t *&Ptr) {
+  unsigned NumToSkip = *Ptr++;
+  NumToSkip |= (*Ptr++) << 8;
+)";
+  if (getNumToSkipInBytes() == 3)
+    OS << "  NumToSkip |= (*Ptr++) << 16;\n";
+  OS << R"(  return NumToSkip;
+}
+
+// Forward declaration.
+[[maybe_unused]] static bool checkDecoderPredicate(unsigned Idx, const FeatureBitset &Bits);
 
 )";
 }
@@ -2445,17 +2646,18 @@ void DecoderEmitter::run(raw_ostream &o) {
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include <assert.h>
+#include <bitset>
 
 namespace {
 )";
 
-  emitFieldFromInstruction(OS);
+  emitCommonFunctions(OS);
   emitInsertBits(OS);
-  emitCheck(OS);
 
   Target.reverseBitsForLittleEndianEncoding();
 
@@ -2542,53 +2744,142 @@ namespace {
     }
   }
 
+  bool GenerateTemplated =
+      Target.getInstructionSet()->getValueAsBit("GenerateTemplatedDecoder");
+
+  // For variable length instructions, we emit an instruction length table to
+  // let the decoder know how long the instructions are. You can see example
+  // usage in M68k's disassembler.
+  if (IsVarLenInst) {
+    if (GenerateTemplated)
+      PrintFatalError(
+          "Templated decoder not needed for variable length instruction");
+    emitInstrLenTable(OS, InstrLen);
+  }
+
   DecoderTableInfo TableInfo;
+  bool HasCheckPredicate = false;
   unsigned OpcodeMask = 0;
-  for (const auto &[NSAndByteSize, EncodingIDs] : EncMap) {
-    const std::string &DecoderNamespace = NSAndByteSize.first;
-    const unsigned BitWidth = 8 * NSAndByteSize.second;
-    // Emit the decoder for this namespace+width combination.
-    FilterChooser FC(NumberedEncodings, EncodingIDs, Operands,
-                     IsVarLenInst ? MaxInstLen : BitWidth, this);
-
-    // The decode table is cleared for each top level decoder function. The
-    // predicates and decoders themselves, however, are shared across all
-    // decoders to give more opportunities for uniqueing.
-    TableInfo.Table.clear();
-    TableInfo.pushScope();
-    FC.emitTableEntries(TableInfo);
-    // Any NumToSkip fixups in the top level scope can resolve to the
-    // OPC_Fail at the end of the table.
-    assert(TableInfo.isOutermostScope() && "fixup stack phasing error!");
-    TableInfo.popScope();
 
-    TableInfo.Table.push_back(MCD::OPC_Fail);
+  // Helper lambda to emit the decoder code for a given instruction Bitwidth
+  // and associated C++ type. If `Type` is a `TemplateTy` (i.e. its `Bitwidth`
+  // is 0), it will generate templated decoder code.
+  auto emitDecoder = [&](const CPPType &Type, StringRef Suffix) {
+    // Reset the Decoders for each non-templated type.
+    TableInfo.Decoders.clear();
+
+    const bool IsTemplate = Type.Kind == CPPType::TemplateTy;
+    if (!IsTemplate) {
+      OS << "// ------------------------------------------------------------\n";
+      OS << "// Decoder tables for " << Type.Bitwidth << "-bit instructions.\n";
+      OS << "// Using C++ type `" << Type.getName() << "` for payload.\n\n";
+    }
 
-    // Print the table to the output stream.
-    OpcodeMask |= emitTable(OS, TableInfo.Table, FC.getBitWidth(),
-                            DecoderNamespace, EncodingIDs);
-  }
+    for (const auto &[NSAndByteSize, EncodingIDs] : EncMap) {
+      const std::string &DecoderNamespace = NSAndByteSize.first;
+      const unsigned InstrBitwidth =
+          IsVarLenInst ? MaxInstLen : 8 * NSAndByteSize.second;
 
-  // For variable instruction, we emit a instruction length table
-  // to let the decoder know how long the instructions are.
-  // You can see example usage in M68k's disassembler.
-  if (IsVarLenInst)
-    emitInstrLenTable(OS, InstrLen);
+      if (!IsTemplate && InstrBitwidth != Type.Bitwidth)
+        continue;
 
-  const bool HasCheckPredicate =
-      OpcodeMask &
-      ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail));
+      // Emit the decoder table for this namespace+bitwidth combination.
+      FilterChooser FC(NumberedEncodings, EncodingIDs, Operands,
+                       IsVarLenInst ? MaxInstLen : InstrBitwidth, this);
+
+      // The decode table is cleared for each top level decoder table generated.
+      //
+      // The decoders themselves are shared across all decoder tables for a
+      // given instruction bitwidth, to give more opportunity for uniqueing.
+      // Generally, decoders across different instruction bitwidths do not
+      // have much uniqueing opportunity, and we generate a different decode
+      // function for each bitwidth, so we clear the decoders themselves for
+      // each bitwidth (at the start of `emitDecoder`).
+      //
+      // Predicates are shared across all decoders to give more opportunities
+      // for uniqueing.
+      TableInfo.Table.clear();
+      TableInfo.pushScope();
+      FC.emitTableEntries(TableInfo);
+      // Any NumToSkip fixups in the top level scope can resolve to the
+      // OPC_Fail at the end of the table.
+      assert(TableInfo.isOutermostScope() && "fixup stack phasing error!");
+      TableInfo.popScope();
+      TableInfo.Table.push_back(MCD::OPC_Fail);
+
+      // Print the table to the output stream.
+      OpcodeMask |= emitTable(OS, TableInfo.Table, FC.getBitWidth(),
+                              DecoderNamespace, EncodingIDs);
+    }
+
+    // Emit the decoder function for this Bitwidth.
+    emitDecoderFunction(OS, TableInfo.Decoders, Type, Suffix);
+
+    HasCheckPredicate |= OpcodeMask & ((1 << MCD::OPC_CheckPredicate) |
+                                       (1 << MCD::OPC_CheckPredicateOrFail));
+  };
+
+  if (GenerateTemplated) {
+    // Generate the templated variant of `fieldFromInstruction`.
+    emitFieldFromInstruction(
+        OS, /*GenerateIntType=*/true, /*GenerateBitsetType=*/false,
+        /*GenerateAPIntType=*/false, /*GenerateTemplateType=*/true);
+    const CPPType Type(0, false);
+    emitDecoder(Type, /*Suffix=*/"");
+    emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask, Type, /*Suffix=*/"",
+                          /*IsImplFunction=*/false);
+  } else {
+    // Collect all allowed Bitwidths for instructions and keep track of the
+    // variants of `fieldFromInstruction` we need to generate.
+    SmallSet<unsigned, 4> InstrBitwidths;
+    bool GenerateIntType = false;
+    bool GenerateBitsetType = false;
+    for (const auto &[NSAndByteSize, _] : EncMap) {
+      const unsigned Bitwidth =
+          IsVarLenInst ? MaxInstLen : 8 * NSAndByteSize.second;
+      InstrBitwidths.insert(Bitwidth);
+      const CPPType::TypeKind Kind = CPPType(Bitwidth, IsVarLenInst).Kind;
+      GenerateIntType |= Kind == CPPType::UIntTy;
+      GenerateBitsetType |= Kind == CPPType::BitsetTy;
+    }
+    assert((!IsVarLenInst || InstrBitwidths.size() == 1) &&
+           "Expect a single instruction bitwidth "
+           "for variable length instructions");
+
+    // Generate required variants of `fieldFromInstruction`.
+    emitFieldFromInstruction(OS, GenerateIntType, GenerateBitsetType,
+                             /*GenerateAPIntType=*/IsVarLenInst,
+                             /*GenerateTemplateType=*/false);
+    const bool AddSuffix = InstrBitwidths.size() > 1;
+    unsigned MaxBitwidth = 0;
+    for (unsigned Bitwidth : InstrBitwidths) {
+      std::string Suffix = AddSuffix ? std::to_string(Bitwidth) : "";
+      const CPPType Type(Bitwidth, IsVarLenInst);
+      emitDecoder(Type, Suffix);
+      if (!AddSuffix) {
+        emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask, Type, Suffix,
+                              /*IsImplFunction=*/false);
+      }
+      MaxBitwidth = std::max(MaxBitwidth, Bitwidth);
+    }
+
+    if (AddSuffix) {
+      // Emit top-level decodeInstruction. This will be an implementation
+      // function and one entry point for each bit width. The implementation
+      // function will operate on the max bitwidth type.
+      const CPPType MaxType(MaxBitwidth, IsVarLenInst);
+      emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask, MaxType, "",
+                            /*IsImplFunction=*/true);
+      for (unsigned Bitwidth : InstrBitwidths)
+        emitDecodeInstructionAsCallToImpl(OS, CPPType(Bitwidth, IsVarLenInst),
+                                          MaxType, std::to_string(Bitwidth));
+    }
+  }
 
   // Emit the predicate function.
   if (HasCheckPredicate)
     emitPredicateFunction(OS, TableInfo.Predicates);
 
-  // Emit the decoder function.
-  emitDecoderFunction(OS, TableInfo.Decoders);
-
-  // Emit the main entry point for the decoder, decodeInstruction().
-  emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask);
-
   OS << "\n} // namespace\n";
 }