[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)

Mon May 5 02:15:25 PDT 2025

================
@@ -1361,6 +1375,484 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
   return SIPeepholeSDWA().run(MF);
 }
 
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  unsigned Opcode = MI->getOpcode();
+  if (TII->isSDWA(Opcode))
+    Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+  switch (Opcode) {
+  case AMDGPU::V_CVT_F16_U16_e32:
+  case AMDGPU::V_CVT_F16_U16_e64:
+  case AMDGPU::V_CVT_F16_I16_e32:
+  case AMDGPU::V_CVT_F16_I16_e64:
+  case AMDGPU::V_RCP_F16_e64:
+  case AMDGPU::V_RCP_F16_e32:
+  case AMDGPU::V_RSQ_F16_e64:
+  case AMDGPU::V_RSQ_F16_e32:
+  case AMDGPU::V_SQRT_F16_e64:
+  case AMDGPU::V_SQRT_F16_e32:
+  case AMDGPU::V_LOG_F16_e64:
+  case AMDGPU::V_LOG_F16_e32:
+  case AMDGPU::V_EXP_F16_e64:
+  case AMDGPU::V_EXP_F16_e32:
+  case AMDGPU::V_SIN_F16_e64:
+  case AMDGPU::V_SIN_F16_e32:
+  case AMDGPU::V_COS_F16_e64:
+  case AMDGPU::V_COS_F16_e32:
+  case AMDGPU::V_FLOOR_F16_e64:
+  case AMDGPU::V_FLOOR_F16_e32:
+  case AMDGPU::V_CEIL_F16_e64:
+  case AMDGPU::V_CEIL_F16_e32:
+  case AMDGPU::V_TRUNC_F16_e64:
+  case AMDGPU::V_TRUNC_F16_e32:
+  case AMDGPU::V_RNDNE_F16_e64:
+  case AMDGPU::V_RNDNE_F16_e32:
+  case AMDGPU::V_FRACT_F16_e64:
+  case AMDGPU::V_FRACT_F16_e32:
+  case AMDGPU::V_FREXP_MANT_F16_e64:
+  case AMDGPU::V_FREXP_MANT_F16_e32:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+  case AMDGPU::V_LDEXP_F16_e64:
+  case AMDGPU::V_LDEXP_F16_e32:
+  case AMDGPU::V_ADD_F16_e64:
+  case AMDGPU::V_ADD_F16_e32:
+  case AMDGPU::V_SUB_F16_e64:
+  case AMDGPU::V_SUB_F16_e32:
+  case AMDGPU::V_SUBREV_F16_e64:
+  case AMDGPU::V_SUBREV_F16_e32:
+  case AMDGPU::V_MUL_F16_e64:
+  case AMDGPU::V_MUL_F16_e32:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F16_e32:
+  case AMDGPU::V_MIN_F16_e64:
+  case AMDGPU::V_MIN_F16_e32:
+  case AMDGPU::V_MAD_F16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+  case AMDGPU::V_DIV_FIXUP_F16_e64:
+    return true;
+  case AMDGPU::V_MADAK_F16:
+  case AMDGPU::V_MADMK_F16:
+  case AMDGPU::V_FMAMK_F16:
+  case AMDGPU::V_FMAAK_F16:
+    // NOTE : SKEPTICAL ABOUT IT
+    return false;
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+    // As their sdwa version allow dst_sel to be equal only set to DWORD
+  default:
+    return false;
+  }
+}
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+                                       MachineInstr *Def1MI,
+                                       Register SrcRootReg,
+                                       const SIInstrInfo *TII) {
+  // As if could, the Def1MI would have been sdwa-ed in order to access
+  // upper half, and Def0MI should not be as it accessing lower half.
+  if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+    return false;
+
+  // Def1 should be writing into entire DWORD of dst, with unused part set
+  // to zero-pad.
+  MachineOperand *Def1DstSel =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+  if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+    return false;
+  MachineOperand *Def1DstUnused =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+  if (!Def1DstUnused ||
+      Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
+    return false;
+
+  MachineOperand *Def1Src0 =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
+  MachineOperand *Def1Src1 =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
+  MachineOperand *Def0Src0 =
+      TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
+  MachineOperand *Def0Src1 =
+      TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
+
+  auto checkForDef0MIAccess = [&]() -> bool {
+    if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src0Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+      if (!Def0Src0Sel ||
+          Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+
+    if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src1Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+      if (!Def0Src1Sel ||
+          Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+
+    return false;
+  };
+
+  if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src0Sel =
+        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+    if (!Def1Src0Sel || Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
+      return false;
+
+    if (checkForDef0MIAccess())
+      return true;
+  }
+
+  if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src1Sel =
+        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
+    if (!Def1Src1Sel || Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
+      return false;
+
+    if (checkForDef0MIAccess())
+      return true;
+  }
+
+  return false;
+}
+
+/// Given A and B are in the same MBB, returns true if A comes before B.
+static bool dominates(MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  assert(A->getParent() == B->getParent());
+  const MachineBasicBlock *MBB = A->getParent();
+  auto MBBEnd = MBB->end();
+  if (B == MBBEnd)
+    return true;
+
+  if (A == MBBEnd)
+    return false;
+
+  MachineBasicBlock::const_iterator I = A;
+  while (I != B && I != MBBEnd)
+    I++;
+
+  return (I == B);
+}
+
+// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
+// and preserving the rest of Dst's bits.
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
+                                              MachineOperand &SrcMO,
+                                              AMDGPU::SDWA::SdwaSel OpSel) {
+  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+  if (!TII->isSDWA(MI->getOpcode())) {
+    MachineInstr *SDWAInst = createSDWAVersion(*MI);
+    MI->eraseFromParent();
+    MI = SDWAInst;
+  }
+
+  ConvertedInstructions.push_back(MI);
+  unsigned SDWAOpcode = MI->getOpcode();
+  ++NumSDWAInstructionsToEliminateFP16Pack;
+
+  MachineOperand *Dst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+  assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+  MachineOperand *DstSel = TII->getNamedOperand(*MI, AMDGPU::OpName::dst_sel);
+  assert(DstSel &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+  DstSel->setImm(OpSel);
+
+  MachineOperand *DstUnused =
+      TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
+  assert(DstUnused &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+  assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+         "Dst_unused should not be UNUSED_PRESERVE already");
+  DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+  int PreserveDstIdx =
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+  assert(PreserveDstIdx != -1);
+  MachineOperand NewSrcImplitMO =
+      MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+  copyRegOperand(NewSrcImplitMO, SrcMO);
+  MI->addOperand(NewSrcImplitMO);
+  MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
+
+  MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
+  if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
+    MachineOperand *Src0Sel =
+        TII->getNamedOperand(*MI, AMDGPU::OpName::src0_sel);
+    assert(Src0Sel &&
+           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
+    Src0Sel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+    return;
+  }
+
+  MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
+  assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
+  if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
+    MachineOperand *Src1Sel =
+        TII->getNamedOperand(*MI, AMDGPU::OpName::src1_sel);
+    assert(Src1Sel &&
+           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
+    Src1Sel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+    return;
----------------
Pierre-vh wrote:

These two cases are also similar enough, you could use a function here too to avoid repetition

https://github.com/llvm/llvm-project/pull/137137