[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)

Vikash Gupta via llvm-commits llvm-commits at lists.llvm.org
Tue May 13 02:11:03 PDT 2025


================
@@ -1361,6 +1375,470 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
   return SIPeepholeSDWA().run(MF);
 }
 
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  unsigned Opcode = MI->getOpcode();
+  if (TII->isSDWA(Opcode))
+    Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+  switch (Opcode) {
+  case AMDGPU::V_CVT_F16_U16_e32:
+  case AMDGPU::V_CVT_F16_U16_e64:
+  case AMDGPU::V_CVT_F16_I16_e32:
+  case AMDGPU::V_CVT_F16_I16_e64:
+  case AMDGPU::V_RCP_F16_e64:
+  case AMDGPU::V_RCP_F16_e32:
+  case AMDGPU::V_RSQ_F16_e64:
+  case AMDGPU::V_RSQ_F16_e32:
+  case AMDGPU::V_SQRT_F16_e64:
+  case AMDGPU::V_SQRT_F16_e32:
+  case AMDGPU::V_LOG_F16_e64:
+  case AMDGPU::V_LOG_F16_e32:
+  case AMDGPU::V_EXP_F16_e64:
+  case AMDGPU::V_EXP_F16_e32:
+  case AMDGPU::V_SIN_F16_e64:
+  case AMDGPU::V_SIN_F16_e32:
+  case AMDGPU::V_COS_F16_e64:
+  case AMDGPU::V_COS_F16_e32:
+  case AMDGPU::V_FLOOR_F16_e64:
+  case AMDGPU::V_FLOOR_F16_e32:
+  case AMDGPU::V_CEIL_F16_e64:
+  case AMDGPU::V_CEIL_F16_e32:
+  case AMDGPU::V_TRUNC_F16_e64:
+  case AMDGPU::V_TRUNC_F16_e32:
+  case AMDGPU::V_RNDNE_F16_e64:
+  case AMDGPU::V_RNDNE_F16_e32:
+  case AMDGPU::V_FRACT_F16_e64:
+  case AMDGPU::V_FRACT_F16_e32:
+  case AMDGPU::V_FREXP_MANT_F16_e64:
+  case AMDGPU::V_FREXP_MANT_F16_e32:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+  case AMDGPU::V_LDEXP_F16_e64:
+  case AMDGPU::V_LDEXP_F16_e32:
+  case AMDGPU::V_ADD_F16_e64:
+  case AMDGPU::V_ADD_F16_e32:
+  case AMDGPU::V_SUB_F16_e64:
+  case AMDGPU::V_SUB_F16_e32:
+  case AMDGPU::V_SUBREV_F16_e64:
+  case AMDGPU::V_SUBREV_F16_e32:
+  case AMDGPU::V_MUL_F16_e64:
+  case AMDGPU::V_MUL_F16_e32:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F16_e32:
+  case AMDGPU::V_MIN_F16_e64:
+  case AMDGPU::V_MIN_F16_e32:
+  case AMDGPU::V_MAD_F16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+  case AMDGPU::V_DIV_FIXUP_F16_e64:
+    return true;
+  case AMDGPU::V_MADAK_F16:
+  case AMDGPU::V_MADMK_F16:
+  case AMDGPU::V_FMAMK_F16:
+  case AMDGPU::V_FMAAK_F16:
+    // NOTE(review): conservatively rejected; it is unclear whether these
+    // opcodes are safe to treat as pure 16-bit src/dst here -- confirm
+    // before enabling them.
+    return false;
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+    // Rejected: the SDWA versions of these opcodes only allow dst_sel to
+    // be set to DWORD, so their destination is never a lone 16-bit half.
+  default:
+    return false;
+  }
+}
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+                                       MachineInstr *Def1MI,
+                                       Register SrcRootReg,
+                                       const SIInstrInfo *TII) {
+  // If it could have been, Def1MI would have been SDWA-ed in order to access
+  // the upper half, whereas Def0MI should not be, as it accesses the lower half.
+  if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+    return false;
+
+  // Def1 should be writing into entire DWORD of dst, with unused part set
+  // to zero-pad.
+  MachineOperand *Def1DstSel =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+  if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+    return false;
+  MachineOperand *Def1DstUnused =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+  if (!Def1DstUnused ||
+      Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
+    return false;
+
+  const auto checkSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+                               AMDGPU::OpName SrcSelName,
+                               AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
+    MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
+    if (DefSrc && DefSrc->isReg() && (DefSrc->getReg() == SrcRootReg)) {
+      MachineOperand *DefSrcSel = TII->getNamedOperand(*DefMI, SrcSelName);
+      if (SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_0) {
+        if (!DefSrcSel || DefSrcSel->getImm() == SdwaSel)
+          return true;
+      } else {
+        assert(SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+               "Not valid SDWA SrcSel operand");
+        if (DefSrcSel && DefSrcSel->getImm() == SdwaSel)
+          return true;
+      }
+    }
+    return false;
+  };
+
+  const auto checkForDef0MIAccess = [&]() -> bool {
+    if (checkSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                    AMDGPU::SDWA::SdwaSel::WORD_0))
+      return true;
+    if (checkSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                    AMDGPU::SDWA::SdwaSel::WORD_0))
+      return true;
+    return false;
+  };
+
+  if (checkSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                  AMDGPU::SDWA::SdwaSel::WORD_1))
+    if (checkForDef0MIAccess())
+      return true;
+
+  if (checkSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                  AMDGPU::SDWA::SdwaSel::WORD_1))
+    if (checkForDef0MIAccess())
+      return true;
----------------
vg0204 wrote:

This only makes sense to me if, at line 1502 of your suggested snippet, the lambda call is negated. Don't you think so?

https://github.com/llvm/llvm-project/pull/137137


More information about the llvm-commits mailing list