[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 00:42:25 PST 2025
================
@@ -1369,6 +1458,411 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
return SIPeepholeSDWA().run(MF);
}
+/// Returns true if the instruction has an FP16 destination and all 16-bit
+/// sources. This is TableGen-generated via VOPSrcDestFP16Table in
+/// VOPInstructions.td.
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+ unsigned Opcode = MI->getOpcode();
+ if (TII->isSDWA(Opcode))
+ Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+ return AMDGPU::isSrcDestFP16Inst(Opcode);
+}
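The generated predicate is essentially a table lookup over opcodes. A minimal standalone sketch of that shape, using placeholder opcode values rather than the real encodings emitted from VOPInstructions.td:
```cpp
// Sketch of an opcode-table predicate in the shape TableGen typically emits;
// the opcode values are placeholders, not real AMDGPU encodings.
#include <algorithm>
#include <cstdint>
#include <iterator>

namespace sketch {
// Hypothetical table, kept sorted so binary search applies.
constexpr uint16_t SrcDestFP16Table[] = {10 /*ADD_F16*/, 42 /*MUL_F16*/,
                                         77 /*FMA_F16*/};

bool isSrcDestFP16Inst(uint16_t Opcode) {
  return std::binary_search(std::begin(SrcDestFP16Table),
                            std::end(SrcDestFP16Table), Opcode);
}
} // namespace sketch
```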
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+ MachineInstr *Def1MI,
+ Register SrcRootReg,
+ const SIInstrInfo *TII) {
+ // The intended scenario is that Def1MI already reads the upper half of
+ // SrcRootReg through an SDWA instruction, while Def0MI still consumes the
+ // lower half of SrcRootReg through its plain, non-SDWA counterpart. Any
+ // other arrangement means SrcRootReg is not used in the pattern we handle.
+ if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+ return false;
+ MachineOperand *Def1DstSel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+ if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+ return false;
+ MachineOperand *Def1DstUnused =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+ if (!Def1DstUnused ||
+ Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
+ return false;
+
+ // Helper to validate whether DefMI uses SrcRootReg as the specified source
+ // operand (SrcName), and if the corresponding SDWA selection operand
+ // (SrcSelName) matches the expected SdwaSel.
+ const auto CheckSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+ AMDGPU::OpName SrcSelName,
+ AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
+ MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
+ if (DefSrc && DefSrc->isReg() && (DefSrc->getReg() == SrcRootReg)) {
+ MachineOperand *DefSrcSel = TII->getNamedOperand(*DefMI, SrcSelName);
+ if (SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_0) {
+ // A missing src_sel (e.g. on a plain, non-SDWA instruction) implicitly
+ // selects the low word.
+ if (!DefSrcSel || DefSrcSel->getImm() == SdwaSel)
+ return true;
+ } else {
+ assert(SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+ "Invalid SDWA SrcSel operand");
+ if (DefSrcSel && DefSrcSel->getImm() == SdwaSel)
+ return true;
+ }
+ }
+ return false;
+ };
+
+ if (!CheckSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1) &&
+ !CheckSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1))
+ return false;
+
+ return CheckSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0) ||
+ CheckSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0);
+}
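To make the accepted shape concrete, here is a toy model of the same decision, with plain structs and enums standing in for the MachineInstr and SDWA operand queries above (illustrative stand-ins only, not LLVM types):
```cpp
#include <optional>

enum class Sel { DWORD, WORD_0, WORD_1 };
enum class Unused { PAD, PRESERVE };

// Stand-in for the operand queries made on a def that reads SrcRootReg.
struct ToyDef {
  bool IsSDWA;
  std::optional<Sel> SrcSel; // nullopt models a missing src*_sel operand
  Sel DstSel = Sel::DWORD;
  Unused DstUnused = Unused::PAD;
};

static bool matchesSel(const ToyDef &D, Sel Want) {
  if (Want == Sel::WORD_0) // a missing sel defaults to the low word
    return !D.SrcSel || *D.SrcSel == Want;
  return D.SrcSel && *D.SrcSel == Want;
}

bool rightSrcRootAccess(const ToyDef &Def0, const ToyDef &Def1) {
  // Def1 must already be SDWA; Def0 must still be the plain opcode.
  if (!Def1.IsSDWA || Def0.IsSDWA)
    return false;
  // Def1 must write the full dword and pad the unused bits.
  if (Def1.DstSel != Sel::DWORD || Def1.DstUnused != Unused::PAD)
    return false;
  // Def1 reads the upper half of the root, Def0 the lower half.
  return matchesSel(Def1, Sel::WORD_1) && matchesSel(Def0, Sel::WORD_0);
}
```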
+
+/// Given that A and B are in the same MBB, returns true if A comes before B.
+/// An iterator equal to the block's end() is treated as coming after every
+/// instruction.
+static bool dominates(MachineBasicBlock::const_iterator A,
+ MachineBasicBlock::const_iterator B) {
+ assert(A->getParent() == B->getParent());
+ const MachineBasicBlock *MBB = A->getParent();
+ auto MBBEnd = MBB->end();
+ if (B == MBBEnd)
+ return true;
+
+ if (A == MBBEnd)
+ return false;
+
+ MachineBasicBlock::const_iterator I = A;
+ while (I != B && I != MBBEnd)
+ ++I;
+
+ return (I == B);
+}
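The walk is a plain linear scan from A toward the end of the block, so it is linear in the block size. The same check, sketched on a std::list purely for illustration:
```cpp
#include <cassert>
#include <iterator>
#include <list>

// Same comes-before check on a plain std::list: walk forward from A until
// B or the end is reached; end() counts as coming after everything.
template <typename T>
bool comesBefore(const std::list<T> &L,
                 typename std::list<T>::const_iterator A,
                 typename std::list<T>::const_iterator B) {
  if (B == L.end())
    return true;
  if (A == L.end())
    return false;
  for (auto I = A; I != L.end(); ++I)
    if (I == B)
      return true;
  return false;
}

int main() {
  std::list<int> L{1, 2, 3};
  assert(comesBefore(L, L.begin(), std::next(L.begin())));
  assert(!comesBefore(L, std::next(L.begin()), L.begin()));
}
```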
+
+// Convert MI into its SDWA version, set its dst_sel and the src_sel of the
+// operand that matches SrcMO to OpSel, and preserve the untouched destination
+// bits by tying an implicit use of SrcMO's register to the destination.
+void SDWAFP16ChainOperand::convertMIToSDWAWithOpsel(SIPeepholeSDWA &Parent,
+ MachineInstr *MI,
+ MachineOperand &SrcMO,
+ SdwaSel OpSel) {
+ const SIInstrInfo *TII = Parent.TII;
+ LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+ if (!TII->isSDWA(MI->getOpcode())) {
+ MachineInstr *SDWAInst = Parent.createSDWAVersion(*MI);
+ MI->eraseFromParent();
+ MI = SDWAInst;
+ }
+
+ Parent.ConvertedInstructions.push_back(MI);
+ unsigned SDWAOpcode = MI->getOpcode();
+ ++NumSDWAInstructionsToEliminateFP16Pack;
+
+ MachineOperand *Dst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+ MachineOperand *DstSel = TII->getNamedOperand(*MI, AMDGPU::OpName::dst_sel);
+ assert(DstSel &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+ DstSel->setImm(OpSel);
+
+ MachineOperand *DstUnused =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
+ assert(DstUnused &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+ assert(DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE &&
+ "Dst_unused should not be UNUSED_PRESERVE already");
+ DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+ int PreserveDstIdx =
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+ assert(PreserveDstIdx != -1);
+ MachineOperand NewSrcImplicitMO =
+ MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+ copyRegOperand(NewSrcImplicitMO, SrcMO);
+ MI->addOperand(NewSrcImplicitMO);
+ MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
+
+ auto ModifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
+ AMDGPU::OpName SrcSelName) -> bool {
+ MachineOperand *Src = TII->getNamedOperand(*MI, SrcName);
+ assert(Src && AMDGPU::hasNamedOperand(SDWAOpcode, SrcName));
+ if (Src->isReg() && (Src->getReg() == SrcMO.getReg())) {
+ MachineOperand *SrcSel = TII->getNamedOperand(*MI, SrcSelName);
+ assert(SrcSel && AMDGPU::hasNamedOperand(SDWAOpcode, SrcSelName));
+ SrcSel->setImm(OpSel);
+
+ LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+ return true;
+ }
+
+ return false;
+ };
+
+ if (ModifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
+ return;
+
+ if (ModifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
+ return;
+}
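In bit-level terms, setting dst_sel to the target half together with UNUSED_PRESERVE means the converted instruction writes only the selected 16 bits of the 32-bit destination, while the tied implicit operand supplies the other half. A small self-contained model of that write behavior:
```cpp
#include <cassert>
#include <cstdint>

enum class Half { WORD_0, WORD_1 };

// The instruction writes only the selected 16-bit half; the tied operand
// supplies the bits of the other, preserved half.
uint32_t writeWithPreserve(uint32_t Tied, uint16_t NewHalf, Half Sel) {
  return Sel == Half::WORD_1 ? (uint32_t(NewHalf) << 16) | (Tied & 0xFFFFu)
                             : (Tied & 0xFFFF0000u) | NewHalf;
}

int main() {
  assert(writeWithPreserve(0xAAAABBBB, 0x1234, Half::WORD_1) == 0x1234BBBBu);
  assert(writeWithPreserve(0xAAAABBBB, 0x1234, Half::WORD_0) == 0xAAAA1234u);
}
```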
+
+// Backtrack ParentMI to locate use operands defined by SDWA-convertible FP16
+// instructions. Track the linear chain as long as exactly one qualifying def is
+// found; bail out once the path forks.
+unsigned SDWAFP16ChainOperand::computeMIChainsForPackedOps(
+ SIPeepholeSDWA &Parent, MachineInstr *ParentMI,
+ SmallVectorImpl<MachineOperand *> &DefSrcVec, const GCNSubtarget &ST) {
+ unsigned NumOfFP16Def;
+ MachineRegisterInfo *MRI = Parent.MRI;
+ const SIInstrInfo *TII = Parent.TII;
+
+ // Walk up the use-def chain from ParentMI until we hit an exit condition:
+ // either no use operand has a def that is convertible to SDWA, or more
+ // than one qualifying def is found, in which case we no longer know which
+ // path to follow.
+ do {
+ NumOfFP16Def = 0;
+ MachineInstr *NextMIInChain = nullptr;
+ for (MachineOperand &CurrentMO : ParentMI->uses()) {
+ if (!CurrentMO.isReg() || CurrentMO.getReg().isPhysical() ||
+ !MRI->hasOneUse(CurrentMO.getReg()))
+ continue;
+
+ MachineOperand *DefCurrMO = findSingleRegDef(&CurrentMO, MRI);
+ if (!DefCurrMO)
+ continue;
+
+ MachineInstr *DefCurrMI = DefCurrMO->getParent();
+ if (!isSrcDestFP16Bits(DefCurrMI, TII) ||
+ !isConvertibleToSDWA(*DefCurrMI, ST, TII))
+ continue;
+
+ NextMIInChain = DefCurrMI;
+ DefSrcVec.push_back(DefCurrMO);
+ NumOfFP16Def++;
+ }
+
+ ParentMI = NextMIInChain;
+ } while (NumOfFP16Def == 1);
+
+ return NumOfFP16Def;
+}
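Abstracted away from MachineInstr, the loop has this generic shape (Node is a toy stand-in; the real qualification checks are isSrcDestFP16Bits and isConvertibleToSDWA above):
```cpp
#include <vector>

// Toy stand-in for MachineInstr: QualifyingDefs holds the defs of its use
// operands that passed the FP16/SDWA checks.
struct Node {
  std::vector<Node *> QualifyingDefs;
};

// Follow single qualifying defs upward, recording each one; stop once the
// chain ends (returns 0) or forks (returns the number of defs seen, > 1).
unsigned collectLinearChain(Node *N, std::vector<Node *> &Chain) {
  unsigned NumDefs;
  do {
    NumDefs = 0;
    Node *Next = nullptr;
    for (Node *Def : N->QualifyingDefs) {
      Chain.push_back(Def); // mirrors DefSrcVec.push_back(DefCurrMO)
      Next = Def;
      ++NumDefs;
    }
    N = Next;
  } while (NumDefs == 1);
  return NumDefs;
}
```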
+
+// Examine V_PACK_B32_F16 uses and attempt to form an FP16 chain candidate that
+// can be converted into SDWA form. This mirrors the legacy flow:
+// Op0Initial -> ... -> Op0Final -> pack
+// Op1Initial -> ... -> Op1Final -> pack
+// If dominance allows, the chains are canonicalized into a single queue that
+// records the order in which SDWA conversions should apply.
+std::optional<FP16PackCandidate>
+SDWAFP16ChainOperand::buildCandidate(SIPeepholeSDWA &Parent, MachineInstr &MI,
+ const GCNSubtarget &ST) {
+ if (MI.getOpcode() != AMDGPU::V_PACK_B32_F16_e64 || !ST.has16BitInsts())
+ return std::nullopt;
+
+ const SIInstrInfo *TII = Parent.TII;
+ MachineRegisterInfo *MRI = Parent.MRI;
+ const SIRegisterInfo *TRI = Parent.TRI;
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src0 || !Src1)
+ return std::nullopt;
+
+ if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+ !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+ Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+ return std::nullopt;
+
+ MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+ MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+ if (!Op0 || !Op1)
+ return std::nullopt;
+
+ MachineInstr *ParentMIOp0 = Op0->getParent();
+ MachineInstr *ParentMIOp1 = Op1->getParent();
+
+ if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+ !isSrcDestFP16Bits(ParentMIOp1, TII))
+ return std::nullopt;
+
+ if (!isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+ return std::nullopt;
+
+ SmallVector<MachineOperand *, 8> DefSrc0Vec;
+ SmallVector<MachineOperand *, 8> DefSrc1Vec;
+ DefSrc0Vec.push_back(Op0);
+ DefSrc1Vec.push_back(Op1);
----------------
Pierre-vh wrote:
```suggestion
SmallVector<MachineOperand *, 8> DefSrc0Vec = { Op0 };
SmallVector<MachineOperand *, 8> DefSrc1Vec = { Op1 };
```
https://github.com/llvm/llvm-project/pull/137137