[llvm] [TableGen][DecoderEmitter] Add option to emit type-specialized code (PR #146593)

Fri Jul 25 15:53:00 PDT 2025

jurahul wrote:

I finally got around to prototyping this. I went with the idea that we will have a single non-templated `decodeInstructionImpl` that operates at the max instruction width and that accepts a `function_ref<>` for dynamic dispatch to the correct version of `decodeToMCInst`. As an example, here's the code generated for AMDGPU:

```
static DecodeStatus decodeInstructionImpl(const uint8_t DecodeTable[], MCInst &MI, const std::bitset<128> &insn, 
                                          uint64_t Address, const MCDisassembler *DisAsm, const MCSubtargetInfo &STI,
                                          function_ref<DecodeStatus(unsigned, DecodeStatus, MCInst &, uint64_t, const MCDisassembler *, bool &)> decodeToMCInstPtr) {
...
}

static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, uint32_t insn, uint64_t Address, const MCDisassembler *DisAsm, const MCSubtargetInfo &STI) {
  std::bitset<128> InsnMaxWidth = insn;
  auto DecodeToMCInst = [insn](unsigned DecodeIdx, DecodeStatus S, MCInst &MI, uint64_t Address, const MCDisassembler *DisAsm, bool &DecodeComplete) {
    return decodeToMCInst32(DecodeIdx, S, insn, MI, Address, DisAsm, DecodeComplete);
  };
  return decodeInstructionImpl(DecodeTable, MI, InsnMaxWidth, Address, DisAsm, STI, DecodeToMCInst);
}

static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, uint64_t insn, uint64_t Address, const MCDisassembler *DisAsm, const MCSubtargetInfo &STI) {
  std::bitset<128> InsnMaxWidth = insn;
  auto DecodeToMCInst = [insn](unsigned DecodeIdx, DecodeStatus S, MCInst &MI, uint64_t Address, const MCDisassembler *DisAsm, bool &DecodeComplete) {
    return decodeToMCInst64(DecodeIdx, S, insn, MI, Address, DisAsm, DecodeComplete);
  };
  return decodeInstructionImpl(DecodeTable, MI, InsnMaxWidth, Address, DisAsm, STI, DecodeToMCInst);
}

static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, const std::bitset<96> &insn, uint64_t Address, const MCDisassembler *DisAsm, const MCSubtargetInfo &STI) {
  const std::bitset<96> Mask(maskTrailingOnes<uint64_t>(64));
  std::bitset<128> InsnMaxWidth((insn & Mask).to_ulong());
  InsnMaxWidth |= std::bitset<128>(((insn >> 64) & Mask).to_ulong()) << 64;

  auto DecodeToMCInst = [&insn](unsigned DecodeIdx, DecodeStatus S, MCInst &MI, uint64_t Address, const MCDisassembler *DisAsm, bool &DecodeComplete) {
    return decodeToMCInst96(DecodeIdx, S, insn, MI, Address, DisAsm, DecodeComplete);
  };
  return decodeInstructionImpl(DecodeTable, MI, InsnMaxWidth, Address, DisAsm, STI, DecodeToMCInst);
}

static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, const std::bitset<128> &insn, uint64_t Address, const MCDisassembler *DisAsm, const MCSubtargetInfo &STI) {
  std::bitset<128> InsnMaxWidth = insn;
  auto DecodeToMCInst = [&insn](unsigned DecodeIdx, DecodeStatus S, MCInst &MI, uint64_t Address, const MCDisassembler *DisAsm, bool &DecodeComplete) {
    return decodeToMCInst128(DecodeIdx, S, insn, MI, Address, DisAsm, DecodeComplete);
  };
  return decodeInstructionImpl(DecodeTable, MI, InsnMaxWidth, Address, DisAsm, STI, DecodeToMCInst);
}
```

So we still have the per-bit-width overloads of `decodeInstruction`, which upcast the bits to the max bit width and call `decodeInstructionImpl`. With this, the code size regression for RISCV is not as large as with the earlier version:

```
                New         old

RISCV   text    57270       55660
        rodata  37825       38058

AMDGPU  text    268044      440444
        rodata  360568      378952

```

I'll do some more testing and put this version up for review.

https://github.com/llvm/llvm-project/pull/146593