[llvm] [AMDGPU] [PeepholeOpt] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instructions (PR #137137)

via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 24 03:24:06 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Vikash Gupta (vg0204)

<details>
<summary>Changes</summary>

As the compiler has no packed fp16 instructions for these operations, ISel scalarizes each fp16 operation in wide fp16 vectors and generates separate individual fp16 results, which are later packed. Then, in the post-ISel SIPeepholeSDWA pass, instructions are opportunistically converted into their SDWA/OPSEL-able versions.

This patch gets rid of unnecessary packing in wide fp16 vector operations for SDWA/OPSEL-able instructions by writing each partial fp16 result back into the corresponding half of the input register, while keeping the remaining bits of that register intact by setting the SDWA dst_unused operand to UNUSED_PRESERVE. As it builds on the machinery for generating SDWA instructions, it is invoked at the end of the SIPeepholeSDWA pass.
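For illustration, here is a hand-written sketch of the intended effect on a `<2 x half>` sqrt, assuming the input vector lives in `v0` (this is not lifted verbatim from the patch's tests):

```
; Before: each half is computed separately and then re-packed.
v_sqrt_f16_e32 v1, v0
v_sqrt_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
v_pack_b32_f16 v0, v1, v2

; After: the second result overwrites only the upper halfword while
; UNUSED_PRESERVE keeps the lower halfword intact, so no v_pack is needed.
v_sqrt_f16_e32 v0, v0
v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
```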

#137150 provides the pre-commit test case.

---

Patch is 150.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137137.diff


20 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (+514-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll (+40-48) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll (+16-24) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll (+48-52) 
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+8-12) 
- (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+21-30) 
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+5-6) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+8-10) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+6-8) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+8-10) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+14-21) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+11-12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+18-26) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+150-110) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+150-110) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+44-66) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll (+2-3) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+8-10) 
- (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+5-7) 
- (modified) llvm/test/CodeGen/AMDGPU/roundeven.ll (+32-48) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 22f23e4c94e2d..b8bb27430707a 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include <optional>
+#include <queue>
 
 using namespace llvm;
 
@@ -35,6 +36,11 @@ using namespace llvm;
 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
 STATISTIC(NumSDWAInstructionsPeepholed,
           "Number of instruction converted to SDWA.");
+STATISTIC(Num16BitPackedInstructionsEliminated,
+          "Number of packed instructions eliminated.");
+STATISTIC(NumSDWAInstructionsToEliminateFP16Pack,
+          "Number of instructions converted/modified into SDWA to eliminate "
+          "FP16 packing.");
 
 namespace {
 
@@ -66,6 +72,14 @@ class SIPeepholeSDWA {
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
+  void eliminateFP16Packing(MachineBasicBlock &MBB, const GCNSubtarget &ST);
+  unsigned
+  computeMIChainsForPackedOps(MachineInstr *ParentMI,
+                              std::queue<MachineOperand *> &DefSrcQueue,
+                              const GCNSubtarget &ST);
+  void convertMIToSDWAWithOpsel(MachineInstr &MI, MachineOperand &SrcMO,
+                                AMDGPU::SDWA::SdwaSel OpSel);
+
 public:
   bool run(MachineFunction &MF);
 };
@@ -266,13 +280,17 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
 
 #endif
 
-static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From,
+                           bool IsKill = false) {
   assert(To.isReg() && From.isReg());
   To.setReg(From.getReg());
   To.setSubReg(From.getSubReg());
   To.setIsUndef(From.isUndef());
   if (To.isUse()) {
-    To.setIsKill(From.isKill());
+    To.setIsKill(IsKill || From.isKill());
   } else {
     To.setIsDead(From.isDead());
   }
@@ -1361,6 +1379,494 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
   return SIPeepholeSDWA().run(MF);
 }
 
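+// Returns true if MI (after mapping an SDWA opcode back to its basic form)
+// is a 16-bit operation whose SDWA version supports halfword dst_sel.
+// MAC/FMAC are excluded since their SDWA versions only allow dst_sel to be
+// DWORD.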
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  unsigned Opcode = MI->getOpcode();
+  if (TII->isSDWA(Opcode))
+    Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+  switch (Opcode) {
+  case AMDGPU::V_CVT_F16_U16_e32:
+  case AMDGPU::V_CVT_F16_U16_e64:
+  case AMDGPU::V_CVT_F16_I16_e32:
+  case AMDGPU::V_CVT_F16_I16_e64:
+  case AMDGPU::V_RCP_F16_e64:
+  case AMDGPU::V_RCP_F16_e32:
+  case AMDGPU::V_RSQ_F16_e64:
+  case AMDGPU::V_RSQ_F16_e32:
+  case AMDGPU::V_SQRT_F16_e64:
+  case AMDGPU::V_SQRT_F16_e32:
+  case AMDGPU::V_LOG_F16_e64:
+  case AMDGPU::V_LOG_F16_e32:
+  case AMDGPU::V_EXP_F16_e64:
+  case AMDGPU::V_EXP_F16_e32:
+  case AMDGPU::V_SIN_F16_e64:
+  case AMDGPU::V_SIN_F16_e32:
+  case AMDGPU::V_COS_F16_e64:
+  case AMDGPU::V_COS_F16_e32:
+  case AMDGPU::V_FLOOR_F16_e64:
+  case AMDGPU::V_FLOOR_F16_e32:
+  case AMDGPU::V_CEIL_F16_e64:
+  case AMDGPU::V_CEIL_F16_e32:
+  case AMDGPU::V_TRUNC_F16_e64:
+  case AMDGPU::V_TRUNC_F16_e32:
+  case AMDGPU::V_RNDNE_F16_e64:
+  case AMDGPU::V_RNDNE_F16_e32:
+  case AMDGPU::V_FRACT_F16_e64:
+  case AMDGPU::V_FRACT_F16_e32:
+  case AMDGPU::V_FREXP_MANT_F16_e64:
+  case AMDGPU::V_FREXP_MANT_F16_e32:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+  case AMDGPU::V_LDEXP_F16_e64:
+  case AMDGPU::V_LDEXP_F16_e32:
+  case AMDGPU::V_ADD_F16_e64:
+  case AMDGPU::V_ADD_F16_e32:
+  case AMDGPU::V_SUB_F16_e64:
+  case AMDGPU::V_SUB_F16_e32:
+  case AMDGPU::V_SUBREV_F16_e64:
+  case AMDGPU::V_SUBREV_F16_e32:
+  case AMDGPU::V_MUL_F16_e64:
+  case AMDGPU::V_MUL_F16_e32:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F16_e32:
+  case AMDGPU::V_MIN_F16_e64:
+  case AMDGPU::V_MIN_F16_e32:
+  case AMDGPU::V_MAD_F16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+  case AMDGPU::V_DIV_FIXUP_F16_e64:
+    return true;
+  case AMDGPU::V_MADAK_F16:
+  case AMDGPU::V_MADMK_F16:
+  case AMDGPU::V_FMAMK_F16:
+  case AMDGPU::V_FMAAK_F16:
+    // NOTE: Skeptical about these; conservatively treated as unsupported.
+    return false;
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+    // Their SDWA versions only allow dst_sel to be set to DWORD.
+  default:
+    return false;
+  }
+}
+
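+// Checks that Def1MI reads the upper halfword (WORD_1) of SrcRootReg and
+// Def0MI reads the lower halfword (WORD_0), i.e. the two defs access the
+// expected halves of the common source root register.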
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+                                       MachineInstr *Def1MI,
+                                       Register SrcRootReg,
+                                       const SIInstrInfo *TII) {
+  // If it could be, Def1MI would already have been converted to SDWA.
+  if (!TII->isSDWA(Def1MI->getOpcode()))
+    return false;
+
+  MachineOperand *Def1Src0 =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
+  MachineOperand *Def1Src1 =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
+  MachineOperand *Def0Src0 =
+      TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
+  MachineOperand *Def0Src1 =
+      TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
+
+  if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src0Sel =
+        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+    if (!Def1Src0Sel ||
+        (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+      return false;
+
+    if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src0Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+      if (!Def0Src0Sel ||
+          Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+
+    if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src1Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+      if (!Def0Src1Sel ||
+          Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+  }
+
+  if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src1Sel =
+        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
+    if (!Def1Src1Sel ||
+        (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+      return false;
+
+    if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src0Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+      if (!Def0Src0Sel ||
+          Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+
+    if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src1Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+      if (!Def0Src1Sel ||
+          Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// Given A and B are in the same MBB, returns true if A comes before B.
+static bool dominates(MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  assert(A->getParent() == B->getParent());
+  const MachineBasicBlock *MBB = A->getParent();
+  auto MBBEnd = MBB->end();
+  if (B == MBBEnd)
+    return true;
+
+  MachineBasicBlock::const_iterator I = MBB->begin();
+  for (; &*I != A && &*I != B; ++I)
+    ;
+
+  return &*I == A;
+}
+
+// Convert MI into its SDWA version, setting its dst_sel and the sel of SrcMO
+// to OpSel while preserving the rest of Dst's bits.
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+                                              MachineOperand &SrcMO,
+                                              AMDGPU::SDWA::SdwaSel OpSel) {
+  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+  MachineInstr *SDWAInst;
+  if (TII->isSDWA(MI.getOpcode())) {
+    SDWAInst = &MI;
+  } else {
+    SDWAInst = createSDWAVersion(MI);
+    MI.eraseFromParent();
+  }
+
+  ConvertedInstructions.push_back(SDWAInst);
+  unsigned SDWAOpcode = SDWAInst->getOpcode();
+  ++NumSDWAInstructionsToEliminateFP16Pack;
+
+  MachineOperand *Dst = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst);
+  assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+  MachineOperand *DstSel =
+      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+  assert(DstSel &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+  DstSel->setImm(OpSel);
+
+  MachineOperand *DstUnused =
+      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+  assert(DstUnused &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+  assert(DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE &&
+         "Dst_unused should not be UNUSED_PRESERVE already");
+  DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
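+  // With dst_unused set to UNUSED_PRESERVE, the bits of vdst outside dst_sel
+  // keep their previous value; model that read by tying an implicit use of
+  // the source register to vdst.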
+  auto PreserveDstIdx =
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+  assert(PreserveDstIdx != -1);
+  auto NewSrcImplicitMO =
+      MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+  copyRegOperand(NewSrcImplicitMO, SrcMO);
+  SDWAInst->addOperand(NewSrcImplicitMO);
+  SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+
+  MachineOperand *Src0 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0);
+  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
+  if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
+    MachineOperand *Src0Sel =
+        TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0_sel);
+    assert(Src0Sel &&
+           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
+    Src0Sel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    return;
+  }
+
+  MachineOperand *Src1 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1);
+  assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
+  if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
+    MachineOperand *Src1Sel =
+        TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1_sel);
+    assert(Src1Sel &&
+           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
+    Src1Sel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    return;
+  }
+}
+
+// Backtracks from the given parent MI, recursively looking for use operands
+// that are defined by FP16 (SDWA-able) instructions.
+unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
+    MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
+    const GCNSubtarget &ST) {
+  unsigned NumOfFP16Def;
+  do {
+    MachineInstr *NextMIInChain = nullptr;
+    NumOfFP16Def = 0;
+    for (MachineOperand &CurrentMO : ParentMI->uses()) {
+      if (!CurrentMO.isReg() || CurrentMO.getReg().isPhysical() ||
+          !MRI->hasOneUse(CurrentMO.getReg()))
+        continue;
+
+      MachineOperand *DefCurrMO = findSingleRegDef(&CurrentMO, MRI);
+      if (!DefCurrMO)
+        continue;
+
+      MachineInstr *DefCurrMI = DefCurrMO->getParent();
+      if (!isSrcDestFP16Bits(DefCurrMI, TII) ||
+          !isConvertibleToSDWA(*DefCurrMI, ST, TII))
+        continue;
+
+      NextMIInChain = DefCurrMI;
+      DefSrcQueue.push(DefCurrMO);
+      NumOfFP16Def++;
+    }
+
+    if (NumOfFP16Def > 1)
+      break;
+
+    ParentMI = NextMIInChain;
+  } while (ParentMI);
+
+  return NumOfFP16Def;
+}
+
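+// Scan MBB for V_PACK_B32_F16 instructions whose two sources are produced by
+// chains of scalar FP16 (SDWA-able) instructions rooted at a common packed
+// register, and rewrite those chains to write their results into the two
+// halfwords of that register in place, making the pack redundant.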
+void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
+                                          const GCNSubtarget &ST) {
+  if (!ST.has16BitInsts())
+    return;
+
+  for (MachineInstr &MI : make_early_inc_range(MBB)) {
+    if (MI.getOpcode() == AMDGPU::V_PACK_B32_F16_e64) {
+      LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
+      std::queue<MachineOperand *> DefSrc0Queue;
+      std::queue<MachineOperand *> DefSrc1Queue;
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+      if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+          !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+          Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+        continue;
+
+      MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+      MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+
+      if (!Op0 || !Op1)
+        continue;
+
+      MachineInstr *ParentMIOp0 = Op0->getParent();
+      MachineInstr *ParentMIOp1 = Op1->getParent();
+
+      if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+          !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+          !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+          !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+        continue;
+
+      DefSrc0Queue.push(Op0);
+      DefSrc1Queue.push(Op1);
+
+      // This checks that the given MI has exactly one register MO use that is
+      // defined by a pure FP16 instruction (which is SDWA-able too).
+      unsigned NumOfFP16Def =
+          computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
+      if (NumOfFP16Def > 1)
+        continue;
+
+      NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
+      if (NumOfFP16Def > 1)
+        continue;
+
+      MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+      MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+      Register SrcRootMOReg = AMDGPU::NoRegister;
+
+      // Now check whether the last entries of the DefSrcQueues (the chain
+      // roots) share a common MO; that would be the source root MO of the
+      // element-wise fp16 chain operations.
+      for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+        if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+          continue;
+
+        for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+          if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+            continue;
+
+          if (Current0MO.getReg() == Current1MO.getReg() &&
+              Current0MO.getSubReg() == Current1MO.getSubReg()) {
+            SrcRootMOReg = Current0MO.getReg();
+            break;
+          }
+        }
+        // Found it; no more checks needed.
+        if (SrcRootMOReg != AMDGPU::NoRegister)
+          break;
+      }
+
+      if (SrcRootMOReg == AMDGPU::NoRegister)
+        continue;
+
+      // Also ensure that Def0RootMI and Def1RootMI access the lower and upper
+      // halfwords of SrcRootMOReg, respectively.
+      if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg,
+                                      TII))
+        continue;
+
+      // The graph below represents the connections:
+      //       Op0Initial  -->  Op0x  --> ...  -->  Op0Final
+      //      /                                        \'
+      // SrcRootMO                                     v_pack_b32_f16
+      //      \                                        /
+      //       Op1Initial  -->  Op1x  --> ...  -->  Op1Final
+      // The nomenclature below is based on this flow graph.
+      //
+      // Also, for each DefSrcXQueue:
+      // OpXInitial is at the back & OpXFinal is at the front.
+      auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
+      auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
+      auto Op0InitialMI = (DefSrc0Queue.back())->getParent();
+      auto Op1InitialMI = (DefSrc1Queue.back())->getParent();
+
+      MachineOperand *FinalOutMO = nullptr;
+      std::queue<MachineOperand *> ChainedDefOps;
+      AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+      int NumOfElemInSecondOpChain = 0;
+
+      // Now, we rewire the flow according to the dominance of the MIs, if
+      // possible, and store the operands in ChainedDefOps so they can later
+      // be used to convert the instructions into their SDWA versions:
+      //
+      // If (dominates(Op0FinalMI, Op1InitialMI)) == TRUE
+      // SrcRootMO -> Op0Initial -> Op0x -> ... -> Op0Final
+      // -> Op1Initial -> Op1x -> ... -> Op1Final (FinalOutMO)
+      //
+      // If (dominates(Op1FinalMI, Op0InitialMI)) == TRUE
+      // SrcRootMO -> Op1Initial -> Op1x -> ... -> Op1Final
+      // -> Op0Initial -> Op0x -> ... -> Op0Final (FinalOutMO)
+      //
+      // TODO : Else, not handled!
+      // One such case is observed when multiple fp16 instructions are chained
+      // on an fp16 vector input. For example:
+      //
+      // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+      // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+      // return <2 x half> %res
+      if (dominates(Op0FinalMI, Op1InitialMI)) {
+        int OpIdx = Op1InitialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &MOTo = Op1InitialMI->getOperand(OpIdx);
+        auto MOFrom = DefSrc0Queue.front();
+        copyRegOperand(MOTo, *MOFrom, true);
+        FinalOutMO = DefSrc1Queue.front();
+
+        LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op1InitialMI
+                          << '\n');
+        OpIdx = Op0InitialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &InitialInMO = Op0InitialMI->getOperand(OpIdx);
+
+        while (!DefSrc1Queue.empty()) {
+          ChainedDefOps.push(DefSrc1Queue.front());
+          DefSrc1Queue.pop();
+          NumOfElemInSecondOpChain++;
+        }
+        while (!DefSrc0Queue.empty()) {
+          ChainedDefOps.push(DefSrc0Queue.front());
+          DefSrc0Queue.pop();
+        }
+
+        ChainedDefOps.push(&InitialInMO);
+        OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+      } else if (dominates(Op1FinalMI, Op0InitialMI)) {
+        int OpIdx = Op0InitialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &MOTo = Op0InitialMI->getOperand(OpIdx);
+        auto MOFrom = DefSrc1Queue.front();
+        copyRegOperand(MOTo, *MOFrom, true);
+        FinalOutMO = DefSrc0Queue.front();
+
+        LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op0InitialMI
+                          << '\n');
+        OpIdx = Op1InitialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &InitialInMO = Op1InitialMI->getOperand(OpIdx);
+
+        while (!DefSrc0Queue.empty()) {
+          ChainedDefOps.push(DefSrc0Queue.front());
+          DefSrc0Queue.pop();
+          NumOfElemInSecondOpChain++;
+        }
+        while (!DefSrc1Queue.empty()) {
+          ChainedDefOps.push(DefSrc1Queue.front());
+          DefSrc1Queue.pop();
+        }
+
+        ChainedDefOps.push(&InitialInMO);
+        OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+      } else {
+        LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+        continue;
+      }
+
+      // Replace all uses of the def MO of MI (the v_pack) with FinalOutMO.
+      MachineOperand &DefMO = MI.getOperand(0);
+      for (MachineOperand &MO :
+           make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+        if (!MO.isReg())
+          continue;
+
+        MO.setReg(FinalOutMO->getReg());
+        MO.setSubReg(FinalOutMO->getSubReg());
+      }
+      LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI
+                        << "With " << *FinalOutMO << '\n');
+
+      // Delete v_pack machine instruction
+      LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+      MI.eraseFromParent();
+      ++Num16BitPackedInstructionsEliminated;
+
+      // Convert the chained machine instructions into their SDWA versions.
+      while (ChainedDefOps.size() != 1) {
+        if (NumOfElemInSecondOpChain == 0) {
+          if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+            OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+          else
+            OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+        }
+
+        MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+        ChainedDefOps.pop();
+        MachineOperand *SrcMO = ChainedDefOps.front();
+
+        // Take SrcMO (which is a def) and switch to its single use in DefMI.
+        if (SrcMO->isDef()) {
+          assert(MRI->hasOneUse(SrcMO->getReg()));
+          SrcMO = findSingleRegUse(SrcMO, MRI);
+          assert(DefMI == S...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/137137


More information about the llvm-commits mailing list