[llvm] [AMDGPU] [PeepholeOpt] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)

Vikash Gupta via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 24 01:31:12 PDT 2025


https://github.com/vg0204 created https://github.com/llvm/llvm-project/pull/137137

As the compiler has no fp16 packed instruction, so isel scalarizes each fp16 operation in wide fp16 vectors and generates separate individual fp16 results, which are later packed. Now, in post-isel pass in SIPeepholeSDWA pass, opportunistically any instructions is
eventually converted into its SDWA/OPSEL-able version.

This patch gets rids of unnecessary packing in wider fp16 vectors operation for SDWA/OPSEL-able instruction, by overwriting the partial fp16 result into same input register partially, while maintaining the sanctity of rest of bits in input register, using OPSEL dst_unused operand set as UNUSED_PRESERVED. Owing to the context of generating SDWA instructions, it is invoked at the end of the SIPeepholeSDWA pass.

>From 32fbd612efb2d006904c1f964a91b2ed77d6a24f Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Wed, 23 Apr 2025 12:13:49 +0000
Subject: [PATCH 1/2] [AMDGPU][PeepholeOpt] Eliminate unnecessary packing in
 fp16 vector operations for SDWA/OPSEL-able instruction

As the compiler has no fp16 packed instruction,
so isel scalarizes each fp16 operation in wide fp16 vectors and generates
separate individual fp16 results, which are later packed. Now, in post-
isel pass in SIPeepholeSDWA pass, opportunistically any instructions is
eventually converted into its SDWA/OPSEL-able version.

This patch gets rids of unnecessary packing in wider fp16 vectors
operation for SDWA/OPSEL-able instruction, by overwriting the partial
fp16 result into same input register partially, while maintaining the
sanctity of rest of bits in input register, using OPSEL dst_unused
operand set as UNUSED_PRESERVED. Owing to the context of generating SDWA
instructions, it is invoked at the end of the SIPeepholeSDWA pass.
---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 504 +++++++++++++++++++++-
 1 file changed, 502 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 22f23e4c94e2d..295726ddb7394 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include <optional>
+#include <queue>
 
 using namespace llvm;
 
@@ -35,6 +36,11 @@ using namespace llvm;
 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
 STATISTIC(NumSDWAInstructionsPeepholed,
           "Number of instruction converted to SDWA.");
+STATISTIC(Num16BitPackedInstructionsEliminated,
+          "Number of packed instruction eliminated.");
+STATISTIC(NumSDWAInstructionsToEliminateFP16Pack,
+          "Number of instruction converted/modified into SDWA to eliminate "
+          "FP16 packing.");
 
 namespace {
 
@@ -66,6 +72,14 @@ class SIPeepholeSDWA {
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
+  void eliminateFP16Packing(MachineBasicBlock &MBB, const GCNSubtarget &ST);
+  unsigned
+  computeMIChainsForPackedOps(MachineInstr *ParentMI,
+                              std::queue<MachineOperand *> &DefSrcQueue,
+                              const GCNSubtarget &ST);
+  void convertMIToSDWAWithOpsel(MachineInstr &MI, MachineOperand &SrcMO,
+                                AMDGPU::SDWA::SdwaSel OpSel);
+
 public:
   bool run(MachineFunction &MF);
 };
@@ -266,13 +280,17 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
 
 #endif
 
-static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From,
+                           bool isKill = false) {
   assert(To.isReg() && From.isReg());
   To.setReg(From.getReg());
   To.setSubReg(From.getSubReg());
   To.setIsUndef(From.isUndef());
   if (To.isUse()) {
-    To.setIsKill(From.isKill());
+    if (isKill)
+      To.setIsKill(true);
+    else
+      To.setIsKill(From.isKill());
   } else {
     To.setIsDead(From.isDead());
   }
@@ -1361,6 +1379,482 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
   return SIPeepholeSDWA().run(MF);
 }
 
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  unsigned Opcode = MI->getOpcode();
+  if (TII->isSDWA(Opcode))
+    Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+  switch (Opcode) {
+  case AMDGPU::V_CVT_F16_U16_e32:
+  case AMDGPU::V_CVT_F16_U16_e64:
+  case AMDGPU::V_CVT_F16_I16_e32:
+  case AMDGPU::V_CVT_F16_I16_e64:
+  case AMDGPU::V_RCP_F16_e64:
+  case AMDGPU::V_RCP_F16_e32:
+  case AMDGPU::V_RSQ_F16_e64:
+  case AMDGPU::V_RSQ_F16_e32:
+  case AMDGPU::V_SQRT_F16_e64:
+  case AMDGPU::V_SQRT_F16_e32:
+  case AMDGPU::V_LOG_F16_e64:
+  case AMDGPU::V_LOG_F16_e32:
+  case AMDGPU::V_EXP_F16_e64:
+  case AMDGPU::V_EXP_F16_e32:
+  case AMDGPU::V_SIN_F16_e64:
+  case AMDGPU::V_SIN_F16_e32:
+  case AMDGPU::V_COS_F16_e64:
+  case AMDGPU::V_COS_F16_e32:
+  case AMDGPU::V_FLOOR_F16_e64:
+  case AMDGPU::V_FLOOR_F16_e32:
+  case AMDGPU::V_CEIL_F16_e64:
+  case AMDGPU::V_CEIL_F16_e32:
+  case AMDGPU::V_TRUNC_F16_e64:
+  case AMDGPU::V_TRUNC_F16_e32:
+  case AMDGPU::V_RNDNE_F16_e64:
+  case AMDGPU::V_RNDNE_F16_e32:
+  case AMDGPU::V_FRACT_F16_e64:
+  case AMDGPU::V_FRACT_F16_e32:
+  case AMDGPU::V_FREXP_MANT_F16_e64:
+  case AMDGPU::V_FREXP_MANT_F16_e32:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+  case AMDGPU::V_LDEXP_F16_e64:
+  case AMDGPU::V_LDEXP_F16_e32:
+  case AMDGPU::V_ADD_F16_e64:
+  case AMDGPU::V_ADD_F16_e32:
+  case AMDGPU::V_SUB_F16_e64:
+  case AMDGPU::V_SUB_F16_e32:
+  case AMDGPU::V_SUBREV_F16_e64:
+  case AMDGPU::V_SUBREV_F16_e32:
+  case AMDGPU::V_MUL_F16_e64:
+  case AMDGPU::V_MUL_F16_e32:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F16_e32:
+  case AMDGPU::V_MIN_F16_e64:
+  case AMDGPU::V_MIN_F16_e32:
+  case AMDGPU::V_MAD_F16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+  case AMDGPU::V_DIV_FIXUP_F16_e64:
+    return true;
+  case AMDGPU::V_MADAK_F16:
+  case AMDGPU::V_MADMK_F16:
+  case AMDGPU::V_FMAMK_F16:
+  case AMDGPU::V_FMAAK_F16:
+    // NOTE : SKEPTICAL ABOUT IT
+    return false;
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+    // As their sdwa version allow dst_sel to be equal only set to DWORD
+  default:
+    return false;
+  }
+}
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+                                       MachineInstr *Def1MI,
+                                       Register SrcRootReg,
+                                       const SIInstrInfo *TII) {
+  // As if could, the Def1MI would have been sdwa-ed
+  if(!TII->isSDWA(Def1MI->getOpcode()))
+    return false;
+
+  MachineOperand *Def1Src0 = TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
+  MachineOperand *Def1Src1 = TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
+  MachineOperand *Def0Src0 = TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
+  MachineOperand *Def0Src1 = TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
+  
+  if(Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src0Sel = TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+    if(!Def1Src0Sel || (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+      return false;
+
+    if(Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src0Sel = TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+      if(!Def0Src0Sel)
+        return true;
+      if(Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+
+    if(Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src1Sel = TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+      if(!Def0Src1Sel)
+        return true;
+      if(Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+  }
+
+  if(Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src1Sel = TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
+    if(!Def1Src1Sel || (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+      return false;
+
+    if(Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src0Sel = TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+      if(!Def0Src0Sel)
+        return true;
+      if(Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+
+    if(Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src1Sel = TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+      if(!Def0Src1Sel)
+        return true;
+      if(Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// Given A and B are in the same MBB, returns true if A comes before B.
+static bool dominates(MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  assert(A->getParent() == B->getParent());
+  const MachineBasicBlock *MBB = A->getParent();
+  auto MBBEnd = MBB->end();
+  if (B == MBBEnd)
+    return true;
+
+  MachineBasicBlock::const_iterator I = MBB->begin();
+  for (; &*I != A && &*I != B; ++I)
+    ;
+
+  return &*I == A;
+}
+
+// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
+// and preserving the rest of Dst's bits.
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+                                              MachineOperand &SrcMO,
+                                              AMDGPU::SDWA::SdwaSel OpSel) {
+  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+  MachineInstr *SDWAInst;
+  if (TII->isSDWA(MI.getOpcode())) {
+    SDWAInst = &MI;
+  } else {
+    SDWAInst = createSDWAVersion(MI);
+    MI.eraseFromParent();
+  }
+
+  ConvertedInstructions.push_back(SDWAInst);
+  unsigned SDWAOpcode = SDWAInst->getOpcode();
+  ++NumSDWAInstructionsToEliminateFP16Pack;
+
+  MachineOperand *Dst = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst);
+  assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+  MachineOperand *DstSel =
+      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+  assert(DstSel &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+  DstSel->setImm(OpSel);
+
+  MachineOperand *DstUnused =
+      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+  assert(DstUnused &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+  assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+         "Dst_unused should not be UNUSED_PRESERVE already");
+  DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+  auto PreserveDstIdx =
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+  assert(PreserveDstIdx != -1);
+  auto NewSrcImplitMO = MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+  copyRegOperand(NewSrcImplitMO, SrcMO);
+  SDWAInst->addOperand(NewSrcImplitMO);
+  SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+
+  MachineOperand *Src0 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0);
+  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
+  if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
+    MachineOperand *Src0Sel =
+        TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0_sel);
+    assert(Src0Sel &&
+           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
+    Src0Sel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    return;
+  }
+
+  MachineOperand *Src1 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1);
+  assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
+  if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
+    MachineOperand *Src1Sel =
+        TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1_sel);
+    assert(Src1Sel &&
+           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
+    Src1Sel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    return;
+  }
+}
+
+// BackTracks the given Parent MI to look for any of its use operand that has
+// been defined by FP16 (sdwa-able) in recursive fashion.
+unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
+    MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
+    const GCNSubtarget &ST) {
+  unsigned NumOfFP16Def;
+  do {
+    MachineInstr *NextMIInChain = nullptr;
+    NumOfFP16Def = 0;
+    for (MachineOperand &currentMO : ParentMI->uses()) {
+      if (!currentMO.isReg() || currentMO.getReg().isPhysical() ||
+          !MRI->hasOneUse(currentMO.getReg()))
+        continue;
+
+      MachineOperand *DefCurrMO = findSingleRegDef(&currentMO, MRI);
+      if (!DefCurrMO)
+        continue;
+
+      MachineInstr *DefCurrMI = DefCurrMO->getParent();
+      if (!isSrcDestFP16Bits(DefCurrMI, TII) ||
+          !isConvertibleToSDWA(*DefCurrMI, ST, TII))
+        continue;
+
+      NextMIInChain = DefCurrMI;
+      DefSrcQueue.push(DefCurrMO);
+      NumOfFP16Def++;
+    }
+
+    if (NumOfFP16Def > 1)
+      break;
+
+    ParentMI = NextMIInChain;
+  } while (ParentMI);
+
+  return NumOfFP16Def;
+}
+
+void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
+                                          const GCNSubtarget &ST) {
+  if (!ST.has16BitInsts())
+    return;
+
+  for (MachineInstr &MI : make_early_inc_range(MBB)) {
+    if (MI.getOpcode() == AMDGPU::V_PACK_B32_F16_e64) {
+      LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
+      std::queue<MachineOperand *> DefSrc0Queue;
+      std::queue<MachineOperand *> DefSrc1Queue;
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+      if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+          !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+          Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+        continue;
+
+      MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+      MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+
+      if (!Op0 || !Op1)
+        continue;
+
+      MachineInstr *ParentMIOp0 = Op0->getParent();
+      MachineInstr *ParentMIOp1 = Op1->getParent();
+
+      if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+          !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+          !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+          !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+        continue;
+
+      DefSrc0Queue.push(Op0);
+      DefSrc1Queue.push(Op1);
+
+      // This checks for the given MI, that it only has exact one register MO
+      // use , that is defined by pure FP16 instruction (that is SDWA-able too)
+      unsigned NumOfFP16Def;
+
+      NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
+      if (NumOfFP16Def > 1)
+        continue;
+
+      NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
+      if (NumOfFP16Def > 1)
+        continue;
+
+      MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+      MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+      Register SrcRootMOReg = AMDGPU::NoRegister;
+
+      // Now, check if the last operation for each in of the DefSrcQueue
+      // has the common MO, that would be the source root MO for element-wise
+      // fp16 chain operations
+      for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+        if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+          continue;
+
+        for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+          if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+            continue;
+
+          if (Current0MO.getReg() == Current1MO.getReg() &&
+              Current0MO.getSubReg() == Current1MO.getSubReg()) {
+            SrcRootMOReg = Current0MO.getReg();
+            break;
+          }
+        }
+        // Found it, no more check needed, so break;
+        if (SrcRootMOReg != AMDGPU::NoRegister)
+          break;
+      }
+
+      if (SrcRootMOReg == AMDGPU::NoRegister)
+        continue;
+
+      // Also we need to ensure that each of the DefXRootMI should access the 
+      // lower and upper half word of SrcRootMOReg respectively.
+      if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg,
+                                      TII))
+        continue;
+
+      // The graph below represents the connection :
+      //       Op0Intial  -->  Op0x  --> ...  -->  Op0Final
+      //      /                                       \'
+      // SrcRootMO                                    v_Pack_b32_f16
+      //      \                                       /
+      //       Op1Intial  -->  Op1x  --> ...  -->  Op1Final
+      // The nomenclature is based upon above flow-graph
+      //
+      // Also for each of DefSrcXQueue :
+      // OpXIntial is at back & OpXFinal is at front
+      auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
+      auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
+      auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
+      auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
+
+      MachineOperand *FinalOutMO = nullptr;
+      std::queue<MachineOperand *> ChainedDefOps;
+      AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+      int NumOfElemInSecondOpChain = 0;
+
+      // Now, we will change the flow as per the dominace of MI as follows, if
+      // possible and store it in ChainedDefOps, so later can be used to convert
+      // into its SDWA version:
+      //
+      // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
+      // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
+      // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
+      //
+      // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
+      // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
+      // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
+      //
+      // TODO : Else, not handled!
+      // One such case is observed when multiple fp16 instruction are chained
+      // on a fp16 vector input. For Example :
+      //
+      // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+      // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+      // return <2 x half> %res
+      if (dominates(Op0FinalMI, Op1IntialMI)) {
+        int OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &MOTo = Op1IntialMI->getOperand(OpIdx);
+        auto MOFrom = DefSrc0Queue.front();
+        copyRegOperand(MOTo, *MOFrom, true);
+        FinalOutMO = DefSrc1Queue.front();
+
+        LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op1IntialMI
+                          << '\n');
+        OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &IntialInMO = Op0IntialMI->getOperand(OpIdx);
+
+        while (!DefSrc1Queue.empty()) {
+          ChainedDefOps.push(DefSrc1Queue.front());
+          DefSrc1Queue.pop();
+          NumOfElemInSecondOpChain++;
+        }
+        while (!DefSrc0Queue.empty()) {
+          ChainedDefOps.push(DefSrc0Queue.front());
+          DefSrc0Queue.pop();
+        }
+
+        ChainedDefOps.push(&IntialInMO);
+        OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+      } else if (dominates(Op1FinalMI, Op0IntialMI)) {
+        int OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &MOTo = Op0IntialMI->getOperand(OpIdx);
+        auto MOFrom = DefSrc1Queue.front();
+        copyRegOperand(MOTo, *MOFrom, true);
+        FinalOutMO = DefSrc0Queue.front();
+
+        LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op0IntialMI
+                          << '\n');
+        OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &IntialInMO = Op1IntialMI->getOperand(OpIdx);
+
+        while (!DefSrc0Queue.empty()) {
+          ChainedDefOps.push(DefSrc0Queue.front());
+          DefSrc0Queue.pop();
+          NumOfElemInSecondOpChain++;
+        }
+        while (!DefSrc1Queue.empty()) {
+          ChainedDefOps.push(DefSrc1Queue.front());
+          DefSrc1Queue.pop();
+        }
+
+        ChainedDefOps.push(&IntialInMO);
+        OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+      } else {
+        LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+        continue;
+      }
+
+      // Replace all use places of MI(v_pack) defMO with FinalOutMO.
+      MachineOperand &DefMO = MI.getOperand(0);
+      for (MachineOperand &MO :
+           make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+        if (!MO.isReg())
+          continue;
+
+        MO.setReg(FinalOutMO->getReg());
+        MO.setSubReg(FinalOutMO->getSubReg());
+      }
+      LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI
+                        << "With " << *FinalOutMO << '\n');
+
+      // Delete v_pack machine instruction
+      LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+      MI.eraseFromParent();
+      ++Num16BitPackedInstructionsEliminated;
+
+      // Convert machine instruction into SDWA-version
+      while (ChainedDefOps.size() != 1) {
+        if (NumOfElemInSecondOpChain == 0) {
+          if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+            OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+          else
+            OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+        }
+
+        MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+        ChainedDefOps.pop();
+        MachineOperand *SrcMO = ChainedDefOps.front();
+
+        // Take SrcMO (which are def) as its usage in DefMI
+        if (SrcMO->isDef()) {
+          assert(MRI->hasOneUse(SrcMO->getReg()));
+          SrcMO = findSingleRegUse(SrcMO, MRI);
+          assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
+        }
+
+        convertMIToSDWAWithOpsel(*DefMI, *SrcMO, OpSel);
+        NumOfElemInSecondOpChain--;
+      }
+    }
+  }
+}
+
 bool SIPeepholeSDWA::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
@@ -1418,6 +1912,12 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
       while (!ConvertedInstructions.empty())
         legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
     } while (Changed);
+
+    // Process each v_pack_b32_fp16 instruction in MBB.
+    eliminateFP16Packing(MBB, ST);
+    Ret |= !ConvertedInstructions.empty();
+    while (!ConvertedInstructions.empty())
+      legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
   }
 
   return Ret;

>From a9b7e60024ec9801bc3df0be6a76b9f93ead7120 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 06:56:22 +0000
Subject: [PATCH 2/2] Update the LIT tests to accomodate the patch effects.

---
 .../AMDGPU/GlobalISel/combine-fma-sub-mul.ll  |  88 +++---
 .../GlobalISel/combine-fma-sub-neg-mul.ll     |  40 ++-
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll     | 100 ++++---
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |  20 +-
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |  51 ++--
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |  11 +-
 llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll      |  18 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |  14 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |  18 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  35 +--
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  23 +-
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll        |  44 ++-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          | 260 ++++++++++--------
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        | 260 ++++++++++--------
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         | 110 +++-----
 llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll     |   5 +-
 llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll      |  18 +-
 llvm/test/CodeGen/AMDGPU/repeated-divisor.ll  |  12 +-
 llvm/test/CodeGen/AMDGPU/roundeven.ll         |  80 +++---
 19 files changed, 594 insertions(+), 613 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index e8e29c3d4b526..e7cc28eeabffb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -545,12 +545,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -565,12 +563,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_mul:
@@ -578,12 +574,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -598,12 +592,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -644,12 +636,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -664,12 +656,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_mul_rhs:
@@ -677,12 +669,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -697,12 +689,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index 70f961e2777af..ad11c9b5f28ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -221,12 +221,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -241,12 +239,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -254,12 +250,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -274,12 +268,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul <4 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 5ba036c386a40..56827df6f027c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1101,21 +1101,21 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fdiv_v2f16_afn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_afn:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_afn:
@@ -2782,17 +2782,15 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX9-LABEL: v_rcp_v2f16_arcp:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_arcp:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_arcp:
@@ -2834,17 +2832,15 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
 ; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
@@ -3192,21 +3188,21 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
@@ -3310,21 +3306,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3372,21 +3368,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index c43731893c2d7..12d7f9d4af8c4 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -2188,17 +2188,15 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX9-LABEL: v_rsq_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_rsq_v2f16:
@@ -2398,17 +2396,15 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX9-LABEL: v_neg_rsq_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX9-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX10-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 63ba18a5433aa..68a3db1472aa2 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -187,22 +187,18 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; GFX10-LABEL: fmul_pow2_8xhalf:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v4, v3
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v5, v2
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v6, v1
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v7, v0
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v7, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v6, v1
-; GFX10-NEXT:    v_pack_b32_f16 v2, v5, v2
-; GFX10-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
@@ -302,18 +298,14 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7000
-; GFX10-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v3
-; GFX10-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
-; GFX10-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
-; GFX10-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
-; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v8, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v7, v1
-; GFX10-NEXT:    v_pack_b32_f16 v2, v6, v2
-; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: fmul_pow2_ldexp_8xhalf:
@@ -1085,9 +1077,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v1, v0
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index d957ba93e4fb3..1ca358b90c58a 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1587,9 +1587,8 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX8-LABEL: basic_fract_v2f16_nonan:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_fract_f16_e32 v1, v0
-; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan:
@@ -2610,15 +2609,15 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s6, 0x204
-; GFX8-NEXT:    v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_floor_f16_e32 v4, v0
 ; GFX8-NEXT:    v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cmp_class_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX8-NEXT:    v_fract_f16_e32 v4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GFX8-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, s6
+; GFX8-NEXT:    v_floor_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX8-NEXT:    v_floor_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v0, v5
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 8c5bc4a33a303..a8ddc564e4b51 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -188,11 +188,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cos_f16_e32 v2, v3
-; GFX9-NEXT:    v_cos_f16_e32 v1, v1
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -204,11 +203,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cos_f16_e32 v2, v3
-; GFX10-NEXT:    v_cos_f16_e32 v1, v1
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index fdccacf372dfa..a3ee9655f40a4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6719,9 +6719,8 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
 ; GFX900-SDAG-NEXT:    v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v0
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v2f16_fast:
@@ -6904,13 +6903,12 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 0c2e6f82c9115..2dff6e21f8a17 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6811,11 +6811,10 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_v2f16_fast:
@@ -6998,13 +6997,12 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index c34113a5dfab0..e6e518d13b5f7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -3335,9 +3335,8 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v2f16:
@@ -3413,9 +3412,8 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fabs_v2f16:
@@ -3497,9 +3495,8 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -3582,9 +3579,8 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fneg_v2f16:
@@ -3656,9 +3652,8 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v2f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v2f16_fast:
@@ -3738,10 +3733,9 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp_v3f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v3f16:
@@ -3822,10 +3816,9 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v3f16_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index f44faf4f7edba..c7dcbcfde6d89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -411,9 +411,9 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v3, v2, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v2, v1, 0, 16
@@ -522,13 +522,14 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v3, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v0, 0, 16
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v3, v4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
@@ -628,9 +629,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
@@ -698,9 +698,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 0e66b0af99f34..fe05fdf1226ec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -461,10 +461,9 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v3
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v3
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -585,9 +584,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -697,12 +695,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v3, v3, s4, v5
-; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v5
 ; GFX9-SDAG-NEXT:    v_med3_i32 v4, v4, s4, v5
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v5
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -844,10 +841,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v4
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -973,15 +969,13 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v5, v5, s4, v6
-; GFX9-SDAG-NEXT:    v_med3_i32 v4, v4, s4, v6
 ; GFX9-SDAG-NEXT:    v_med3_i32 v3, v3, s4, v6
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_med3_i32 v4, v4, s4, v6
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v6
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v5
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v1, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1157,12 +1151,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v5
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 1dd6a7926029e..bb23fdd26402d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6870,15 +6870,25 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v2f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7005,22 +7015,22 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_v2f16:
@@ -7157,22 +7167,22 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_v2f16:
@@ -7310,22 +7320,22 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_v2f16:
@@ -7449,15 +7459,25 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v2f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7576,17 +7596,29 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v3f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7715,17 +7747,29 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v3f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7886,31 +7930,29 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16:
@@ -8089,31 +8131,29 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 86a58d26c6ae5..dcf789e26de54 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6870,15 +6870,25 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v2f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7005,22 +7015,22 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_v2f16:
@@ -7157,22 +7167,22 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7310,22 +7320,22 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_v2f16:
@@ -7449,15 +7459,25 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v2f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7576,17 +7596,29 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v3f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7715,17 +7747,29 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v3f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7886,31 +7930,29 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16:
@@ -8089,31 +8131,29 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index ea88f77f98735..5544fd764e841 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -4249,17 +4249,15 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16:
@@ -4367,18 +4365,16 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_v2f16:
@@ -4494,18 +4490,16 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2f16:
@@ -4622,18 +4616,16 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_v2f16:
@@ -4739,17 +4731,15 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v2f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v2f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16_fast:
@@ -4861,19 +4851,17 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v3f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v3f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16:
@@ -4989,19 +4977,17 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v3f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v3f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16_fast:
@@ -5129,23 +5115,19 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16:
@@ -5284,23 +5266,19 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index de12f2b246f57..c6fee73f4580d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -198,9 +198,8 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 1a426096da197..92e8dce75222a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -188,11 +188,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_sin_f16_e32 v2, v3
-; GFX9-NEXT:    v_sin_f16_e32 v1, v1
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -204,11 +203,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_sin_f16_e32 v2, v3
-; GFX10-NEXT:    v_sin_f16_e32 v1, v1
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index 04eea20993608..9f31bde8086d0 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -829,9 +829,8 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
 ; GFX9-LABEL: v_repeat_divisor_v2f16_x2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v2
-; GFX9-NEXT:    v_pack_b32_f16 v2, v2, v3
+; GFX9-NEXT:    v_rcp_f16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rcp_f16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -932,15 +931,14 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
 ; GFX9-LABEL: v_repeat_divisor_v3f16_x2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rcp_f16_e32 v4, v4
+; GFX9-NEXT:    v_rcp_f16_sdwa v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    v_rcp_f16_e32 v5, v5
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7e00
-; GFX9-NEXT:    v_pack_b32_f16 v4, v4, v6
+; GFX9-NEXT:    v_rcp_f16_sdwa v4, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-NEXT:    v_pack_b32_f16 v5, v5, s4
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5
 ; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v5
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_mul_f16 v4, v2, v4
 ; GFX9-NEXT:    v_alignbit_b32 v2, v3, v4, 16
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 59a1fe041bf90..97358044abdaa 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -460,17 +460,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; GFX9-LABEL: v_roundeven_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -523,17 +521,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -602,18 +598,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v2f16_fneg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -676,17 +670,15 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e64 v0, -v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e64 v0, -v0
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -759,23 +751,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX9-LABEL: v_roundeven_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rndne_f16_e32 v3, v1
-; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v4f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
-; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v4f16:
@@ -850,23 +838,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v1, v1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v1, v1
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v4f16:



More information about the llvm-commits mailing list