[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)

Thu Dec 11 00:42:23 PST 2025

================
@@ -1369,6 +1458,411 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
   return SIPeepholeSDWA().run(MF);
 }
 
+/// Returns true if the instruction has FP16 destination and all 16-bit sources.
+/// This is TableGen-generated via VOPSrcDestFP16Table in VOPInstructions.td.
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  unsigned Opcode = MI->getOpcode();
+  if (TII->isSDWA(Opcode))
+    Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+  return AMDGPU::isSrcDestFP16Inst(Opcode);
+}
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+                                       MachineInstr *Def1MI,
+                                       Register SrcRootReg,
+                                       const SIInstrInfo *TII) {
+  // The intended scenario is that Def1MI already reads the upper half from
+  // SrcRootReg via SDWA-able instruction while Def0MI still consumes the lower
+  // half from SrcRootReg without the SDWA counterpart. Any other arrangement
+  // would imply violation of SrcRootReg usage.
+  if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+    return false;
+  MachineOperand *Def1DstSel =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+  if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+    return false;
+  MachineOperand *Def1DstUnused =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+  if (!Def1DstUnused ||
+      Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
+    return false;
+
+  // Helper to validate whether DefMI uses SrcRootReg as the specified source
+  // operand (SrcName), and if the corresponding SDWA selection operand
+  // (SrcSelName) matches the expected SdwaSel.
+  const auto CheckSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+                               AMDGPU::OpName SrcSelName,
+                               AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
+    MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
+    if (DefSrc && DefSrc->isReg() && (DefSrc->getReg() == SrcRootReg)) {
+      MachineOperand *DefSrcSel = TII->getNamedOperand(*DefMI, SrcSelName);
+      if (SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_0) {
+        if (!DefSrcSel || DefSrcSel->getImm() == SdwaSel)
+          return true;
+      } else {
+        assert(SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+               "Not valid SDWA SrcSel operand");
+        if (DefSrcSel && DefSrcSel->getImm() == SdwaSel)
+          return true;
+      }
+    }
+    return false;
+  };
+
+  if (!CheckSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                   AMDGPU::SDWA::SdwaSel::WORD_1) &&
+      !CheckSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                   AMDGPU::SDWA::SdwaSel::WORD_1))
+    return false;
+
+  return CheckSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                     AMDGPU::SDWA::SdwaSel::WORD_0) ||
+         CheckSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                     AMDGPU::SDWA::SdwaSel::WORD_0);
+}
+
+/// Given A and B are in the same MBB, returns true if A comes before B.
+static bool dominates(MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  assert(A->getParent() == B->getParent());
+  const MachineBasicBlock *MBB = A->getParent();
+  auto MBBEnd = MBB->end();
+  if (B == MBBEnd)
+    return true;
+
+  if (A == MBBEnd)
+    return false;
----------------
Pierre-vh wrote:

I don't think you need those distinct checks, the loop below will handle it naturally.

https://github.com/llvm/llvm-project/pull/137137