[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)
Vikash Gupta via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 28 04:09:55 PDT 2025
================
@@ -1361,6 +1379,499 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
return SIPeepholeSDWA().run(MF);
}
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+ unsigned Opcode = MI->getOpcode();
+ if (TII->isSDWA(Opcode))
+ Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+ switch (Opcode) {
+ case AMDGPU::V_CVT_F16_U16_e32:
+ case AMDGPU::V_CVT_F16_U16_e64:
+ case AMDGPU::V_CVT_F16_I16_e32:
+ case AMDGPU::V_CVT_F16_I16_e64:
+ case AMDGPU::V_RCP_F16_e64:
+ case AMDGPU::V_RCP_F16_e32:
+ case AMDGPU::V_RSQ_F16_e64:
+ case AMDGPU::V_RSQ_F16_e32:
+ case AMDGPU::V_SQRT_F16_e64:
+ case AMDGPU::V_SQRT_F16_e32:
+ case AMDGPU::V_LOG_F16_e64:
+ case AMDGPU::V_LOG_F16_e32:
+ case AMDGPU::V_EXP_F16_e64:
+ case AMDGPU::V_EXP_F16_e32:
+ case AMDGPU::V_SIN_F16_e64:
+ case AMDGPU::V_SIN_F16_e32:
+ case AMDGPU::V_COS_F16_e64:
+ case AMDGPU::V_COS_F16_e32:
+ case AMDGPU::V_FLOOR_F16_e64:
+ case AMDGPU::V_FLOOR_F16_e32:
+ case AMDGPU::V_CEIL_F16_e64:
+ case AMDGPU::V_CEIL_F16_e32:
+ case AMDGPU::V_TRUNC_F16_e64:
+ case AMDGPU::V_TRUNC_F16_e32:
+ case AMDGPU::V_RNDNE_F16_e64:
+ case AMDGPU::V_RNDNE_F16_e32:
+ case AMDGPU::V_FRACT_F16_e64:
+ case AMDGPU::V_FRACT_F16_e32:
+ case AMDGPU::V_FREXP_MANT_F16_e64:
+ case AMDGPU::V_FREXP_MANT_F16_e32:
+ case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+ case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+ case AMDGPU::V_LDEXP_F16_e64:
+ case AMDGPU::V_LDEXP_F16_e32:
+ case AMDGPU::V_ADD_F16_e64:
+ case AMDGPU::V_ADD_F16_e32:
+ case AMDGPU::V_SUB_F16_e64:
+ case AMDGPU::V_SUB_F16_e32:
+ case AMDGPU::V_SUBREV_F16_e64:
+ case AMDGPU::V_SUBREV_F16_e32:
+ case AMDGPU::V_MUL_F16_e64:
+ case AMDGPU::V_MUL_F16_e32:
+ case AMDGPU::V_MAX_F16_e64:
+ case AMDGPU::V_MAX_F16_e32:
+ case AMDGPU::V_MIN_F16_e64:
+ case AMDGPU::V_MIN_F16_e32:
+ case AMDGPU::V_MAD_F16_e64:
+ case AMDGPU::V_FMA_F16_e64:
+ case AMDGPU::V_DIV_FIXUP_F16_e64:
+ return true;
+ case AMDGPU::V_MADAK_F16:
+ case AMDGPU::V_MADMK_F16:
+ case AMDGPU::V_FMAMK_F16:
+ case AMDGPU::V_FMAAK_F16:
+ // NOTE : SKEPTICAL ABOUT IT
+ return false;
+ case AMDGPU::V_FMAC_F16_e32:
+ case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_MAC_F16_e32:
+ case AMDGPU::V_MAC_F16_e64:
+ // As their sdwa version allow dst_sel to be equal only set to DWORD
+ default:
+ return false;
+ }
+}
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+ MachineInstr *Def1MI,
+ Register SrcRootReg,
+ const SIInstrInfo *TII) {
+ // As if could, the Def1MI would have been sdwa-ed in order to access
+ // upper half, and Def0MI should not be as it accessing lower half.
+ if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+ return false;
+
+ // Def1 should be writing into entire DWORD of dst, with unused part set
+ // to zero-pad.
+ MachineOperand *Def1DstSel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+ if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+ return false;
+ MachineOperand *Def1DstUnused =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+ if (!Def1DstUnused ||
+ Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
+ return false;
+
+ MachineOperand *Def1Src0 =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
+ MachineOperand *Def1Src1 =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
+ MachineOperand *Def0Src0 =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
+ MachineOperand *Def0Src1 =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
+
+ auto chkForDef0MIAccess = [&]() -> bool {
+ if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+ MachineOperand *Def0Src0Sel =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+ if (!Def0Src0Sel)
+ return true;
+ if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ return true;
+ }
+
+ if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+ MachineOperand *Def0Src1Sel =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+ if (!Def0Src1Sel)
+ return true;
+ if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ return true;
+ }
+
+ return false;
+ };
+
+ if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+ MachineOperand *Def1Src0Sel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+ if (!Def1Src0Sel ||
+ (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ return false;
+
+ if (chkForDef0MIAccess())
+ return true;
+ }
+
+ if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
+ MachineOperand *Def1Src1Sel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
+ if (!Def1Src1Sel ||
+ (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ return false;
+
+ if (chkForDef0MIAccess())
+ return true;
+ }
+
+ return false;
+}
+
+/// Given A and B are in the same MBB, returns true if A comes before B.
+static bool dominates(MachineBasicBlock::const_iterator A,
+ MachineBasicBlock::const_iterator B) {
+ assert(A->getParent() == B->getParent());
+ const MachineBasicBlock *MBB = A->getParent();
+ auto MBBEnd = MBB->end();
+ if (B == MBBEnd)
+ return true;
+
+ MachineBasicBlock::const_iterator I = MBB->begin();
+ for (; &*I != A && &*I != B; ++I)
+ ;
+
+ return &*I == A;
+}
+
+// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
+// and preserving the rest of Dst's bits.
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+ MachineOperand &SrcMO,
+ AMDGPU::SDWA::SdwaSel OpSel) {
+ LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+ MachineInstr *SDWAInst;
+ if (TII->isSDWA(MI.getOpcode())) {
+ SDWAInst = &MI;
+ } else {
+ SDWAInst = createSDWAVersion(MI);
+ MI.eraseFromParent();
+ }
+
+ ConvertedInstructions.push_back(SDWAInst);
+ unsigned SDWAOpcode = SDWAInst->getOpcode();
+ ++NumSDWAInstructionsToEliminateFP16Pack;
+
+ MachineOperand *Dst = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst);
+ assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+ MachineOperand *DstSel =
+ TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+ assert(DstSel &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+ DstSel->setImm(OpSel);
+
+ MachineOperand *DstUnused =
+ TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+ assert(DstUnused &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+ assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+ "Dst_unused should not be UNUSED_PRESERVE already");
+ DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+ auto PreserveDstIdx =
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+ assert(PreserveDstIdx != -1);
+ auto NewSrcImplitMO = MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+ copyRegOperand(NewSrcImplitMO, SrcMO);
+ SDWAInst->addOperand(NewSrcImplitMO);
+ SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+
+ MachineOperand *Src0 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0);
+ assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
+ if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
+ MachineOperand *Src0Sel =
+ TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0_sel);
+ assert(Src0Sel &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
+ Src0Sel->setImm(OpSel);
+
+ LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+ return;
+ }
+
+ MachineOperand *Src1 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1);
+ assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
+ if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
+ MachineOperand *Src1Sel =
+ TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1_sel);
+ assert(Src1Sel &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
+ Src1Sel->setImm(OpSel);
+
+ LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+ return;
+ }
+}
+
+// BackTracks the given Parent MI to look for any of its use operand that has
+// been defined by FP16 (sdwa-able) in recursive fashion.
+unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
+ MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
+ const GCNSubtarget &ST) {
+ unsigned NumOfFP16Def;
+ do {
+ MachineInstr *NextMIInChain = nullptr;
+ NumOfFP16Def = 0;
+ for (MachineOperand ¤tMO : ParentMI->uses()) {
+ if (!currentMO.isReg() || currentMO.getReg().isPhysical() ||
+ !MRI->hasOneUse(currentMO.getReg()))
+ continue;
+
+ MachineOperand *DefCurrMO = findSingleRegDef(¤tMO, MRI);
+ if (!DefCurrMO)
+ continue;
+
+ MachineInstr *DefCurrMI = DefCurrMO->getParent();
+ if (!isSrcDestFP16Bits(DefCurrMI, TII) ||
+ !isConvertibleToSDWA(*DefCurrMI, ST, TII))
+ continue;
+
+ NextMIInChain = DefCurrMI;
+ DefSrcQueue.push(DefCurrMO);
+ NumOfFP16Def++;
+ }
+
+ if (NumOfFP16Def > 1)
+ break;
+
+ ParentMI = NextMIInChain;
+ } while (ParentMI);
+
+ return NumOfFP16Def;
+}
+
+void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
+ const GCNSubtarget &ST) {
+ if (!ST.has16BitInsts())
+ return;
+
+ for (MachineInstr &MI : make_early_inc_range(MBB)) {
----------------
vg0204 wrote:
MIR test added may give better idea!
https://github.com/llvm/llvm-project/pull/137137
More information about the llvm-commits
mailing list