[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 00:42:23 PST 2025
================
@@ -1369,6 +1458,411 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
return SIPeepholeSDWA().run(MF);
}
+/// Returns true if MI has an FP16 destination and all 16-bit sources. An SDWA opcode is first mapped to its basic opcode.
+/// The lookup itself is TableGen-generated via VOPSrcDestFP16Table in VOPInstructions.td.
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+ unsigned Opcode = MI->getOpcode();
+ if (TII->isSDWA(Opcode))
+ Opcode = AMDGPU::getBasicFromSDWAOp(Opcode); // swap in the basic opcode for an SDWA one
+
+ return AMDGPU::isSrcDestFP16Inst(Opcode);
+}
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+ MachineInstr *Def1MI,
+ Register SrcRootReg,
+ const SIInstrInfo *TII) {
+ // Returns true only when Def1MI is an SDWA instruction (dst_sel DWORD,
+ // dst_unused UNUSED_PAD) that reads the upper half (WORD_1) of SrcRootReg via
+ // src0 or src1, while Def0MI is a non-SDWA instruction that reads SrcRootReg
+ // via src0 or src1. Any other arrangement would imply violation of SrcRootReg usage.
+ if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+ return false;
+ MachineOperand *Def1DstSel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+ if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+ return false;
+ MachineOperand *Def1DstUnused =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+ if (!Def1DstUnused ||
+ Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
+ return false;
+
+ // Helper: true if DefMI's SrcName operand is the register SrcRootReg and its
+ // SrcSelName operand matches SdwaSel. For WORD_0 a missing select operand is
+ // also accepted; for WORD_1 the select operand must be present and equal.
+ const auto CheckSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+ AMDGPU::OpName SrcSelName,
+ AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
+ MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
+ if (DefSrc && DefSrc->isReg() && (DefSrc->getReg() == SrcRootReg)) {
+ MachineOperand *DefSrcSel = TII->getNamedOperand(*DefMI, SrcSelName);
+ if (SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_0) {
+ if (!DefSrcSel || DefSrcSel->getImm() == SdwaSel) // no select operand counts as WORD_0
+ return true;
+ } else {
+ assert(SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+ "Not valid SDWA SrcSel operand");
+ if (DefSrcSel && DefSrcSel->getImm() == SdwaSel)
+ return true;
+ }
+ }
+ return false;
+ };
+
+ if (!CheckSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1) &&
+ !CheckSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1))
+ return false; // neither src0 nor src1 of Def1MI reads WORD_1 of SrcRootReg
+
+ return CheckSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0) ||
+ CheckSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0);
+}
+
+/// Given A and B are in the same MBB, returns true if B is the block's end() or is reached by walking forward from A (A == B included); returns false if A is end() and B is not.
+static bool dominates(MachineBasicBlock::const_iterator A,
+ MachineBasicBlock::const_iterator B) {
+ assert(A->getParent() == B->getParent()); // NOTE(review): dereferences A and B before the end() checks below — confirm callers never pass end()
+ const MachineBasicBlock *MBB = A->getParent();
+ auto MBBEnd = MBB->end();
+ if (B == MBBEnd)
+ return true;
+
+ if (A == MBBEnd)
+ return false;
+
+ MachineBasicBlock::const_iterator I = A;
+ while (I != B && I != MBBEnd) // linear scan forward from A until B or the end of the block
+ I++;
+
+ return (I == B);
+}
+
+// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set to OpSel
+// and preserve the untouched destination bits by tying the implicit operand.
+void SDWAFP16ChainOperand::convertMIToSDWAWithOpsel(SIPeepholeSDWA &Parent,
+ MachineInstr *MI,
+ MachineOperand &SrcMO,
+ SdwaSel OpSel) {
+ const SIInstrInfo *TII = Parent.TII;
+ LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+ if (!TII->isSDWA(MI->getOpcode())) {
+ MachineInstr *SDWAInst = Parent.createSDWAVersion(*MI);
+ MI->eraseFromParent();
+ MI = SDWAInst;
+ }
+
+ Parent.ConvertedInstructions.push_back(MI);
+ unsigned SDWAOpcode = MI->getOpcode();
+ ++NumSDWAInstructionsToEliminateFP16Pack;
+
+ MachineOperand *Dst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+ MachineOperand *DstSel = TII->getNamedOperand(*MI, AMDGPU::OpName::dst_sel);
+ assert(DstSel &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+ DstSel->setImm(OpSel);
+
+ MachineOperand *DstUnused =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
+ assert(DstUnused &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+ assert(DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE &&
+ "Dst_unused should not be UNUSED_PRESERVE already");
+ DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+ int PreserveDstIdx =
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+ assert(PreserveDstIdx != -1);
+ MachineOperand NewSrcImplitMO =
+ MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+ copyRegOperand(NewSrcImplitMO, SrcMO);
+ MI->addOperand(NewSrcImplitMO);
+ MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
+
+ auto ModifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
+ AMDGPU::OpName SrcSelName) -> bool {
+ MachineOperand *Src = TII->getNamedOperand(*MI, SrcName);
+ assert(Src && AMDGPU::hasNamedOperand(SDWAOpcode, SrcName));
+ if (Src->isReg() && (Src->getReg() == SrcMO.getReg())) {
+ MachineOperand *SrcSel = TII->getNamedOperand(*MI, SrcSelName);
+ assert(SrcSel && AMDGPU::hasNamedOperand(SDWAOpcode, SrcSelName));
+ SrcSel->setImm(OpSel);
+
+ LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+ return true;
+ }
+
+ return false;
+ };
+
+ if (ModifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
+ return;
+
+ if (ModifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
+ return;
----------------
Pierre-vh wrote:
```suggestion
ModifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel);
```
https://github.com/llvm/llvm-project/pull/137137
More information about the llvm-commits mailing list