[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)

Vikash Gupta via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 3 04:44:51 PDT 2025


https://github.com/vg0204 updated https://github.com/llvm/llvm-project/pull/137137

>From 5d3e2aa0e897131d2e464d8898e5b901817fecc6 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Wed, 23 Apr 2025 12:13:49 +0000
Subject: [PATCH 1/8] [AMDGPU][PeepholeOpt] Eliminate unnecessary packing in
 fp16 vector operations for SDWA/OPSEL-able instruction

As the compiler has no packed fp16 instruction, isel scalarizes each
fp16 operation on wide fp16 vectors and generates separate individual
fp16 results, which are later packed. Afterwards, in the post-isel
SIPeepholeSDWA pass, instructions are opportunistically converted into
their SDWA/OPSEL-able versions.

This patch gets rid of unnecessary packing in wider fp16 vector
operations for SDWA/OPSEL-able instructions by writing each partial
fp16 result into part of the same input register, while preserving the
remaining bits of that register, using the SDWA dst_unused operand set
to UNUSED_PRESERVE. Because it relies on generating SDWA instructions,
it is invoked at the end of the SIPeepholeSDWA pass.
---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 516 +++++++++++++++++++++-
 1 file changed, 514 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index eb18677780803..930dc9aac5a8d 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include <optional>
+#include <queue>
 
 using namespace llvm;
 
@@ -35,6 +36,11 @@ using namespace llvm;
 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
 STATISTIC(NumSDWAInstructionsPeepholed,
           "Number of instruction converted to SDWA.");
+STATISTIC(Num16BitPackedInstructionsEliminated,
+          "Number of packed instruction eliminated.");
+STATISTIC(NumSDWAInstructionsToEliminateFP16Pack,
+          "Number of instruction converted/modified into SDWA to eliminate "
+          "FP16 packing.");
 
 namespace {
 
@@ -67,6 +73,14 @@ class SIPeepholeSDWA {
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
+  void eliminateFP16Packing(MachineBasicBlock &MBB, const GCNSubtarget &ST);
+  unsigned
+  computeMIChainsForPackedOps(MachineInstr *ParentMI,
+                              std::queue<MachineOperand *> &DefSrcQueue,
+                              const GCNSubtarget &ST);
+  void convertMIToSDWAWithOpsel(MachineInstr &MI, MachineOperand &SrcMO,
+                                AMDGPU::SDWA::SdwaSel OpSel);
+
 public:
   bool run(MachineFunction &MF);
 };
@@ -267,13 +281,17 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
 
 #endif
 
-static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From,
+                           bool isKill = false) {
   assert(To.isReg() && From.isReg());
   To.setReg(From.getReg());
   To.setSubReg(From.getSubReg());
   To.setIsUndef(From.isUndef());
   if (To.isUse()) {
-    To.setIsKill(From.isKill());
+    if (isKill)
+      To.setIsKill(true);
+    else
+      To.setIsKill(From.isKill());
   } else {
     To.setIsDead(From.isDead());
   }
@@ -1386,6 +1404,494 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
   return SIPeepholeSDWA().run(MF);
 }
 
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  unsigned Opcode = MI->getOpcode();
+  if (TII->isSDWA(Opcode))
+    Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+  switch (Opcode) {
+  case AMDGPU::V_CVT_F16_U16_e32:
+  case AMDGPU::V_CVT_F16_U16_e64:
+  case AMDGPU::V_CVT_F16_I16_e32:
+  case AMDGPU::V_CVT_F16_I16_e64:
+  case AMDGPU::V_RCP_F16_e64:
+  case AMDGPU::V_RCP_F16_e32:
+  case AMDGPU::V_RSQ_F16_e64:
+  case AMDGPU::V_RSQ_F16_e32:
+  case AMDGPU::V_SQRT_F16_e64:
+  case AMDGPU::V_SQRT_F16_e32:
+  case AMDGPU::V_LOG_F16_e64:
+  case AMDGPU::V_LOG_F16_e32:
+  case AMDGPU::V_EXP_F16_e64:
+  case AMDGPU::V_EXP_F16_e32:
+  case AMDGPU::V_SIN_F16_e64:
+  case AMDGPU::V_SIN_F16_e32:
+  case AMDGPU::V_COS_F16_e64:
+  case AMDGPU::V_COS_F16_e32:
+  case AMDGPU::V_FLOOR_F16_e64:
+  case AMDGPU::V_FLOOR_F16_e32:
+  case AMDGPU::V_CEIL_F16_e64:
+  case AMDGPU::V_CEIL_F16_e32:
+  case AMDGPU::V_TRUNC_F16_e64:
+  case AMDGPU::V_TRUNC_F16_e32:
+  case AMDGPU::V_RNDNE_F16_e64:
+  case AMDGPU::V_RNDNE_F16_e32:
+  case AMDGPU::V_FRACT_F16_e64:
+  case AMDGPU::V_FRACT_F16_e32:
+  case AMDGPU::V_FREXP_MANT_F16_e64:
+  case AMDGPU::V_FREXP_MANT_F16_e32:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+  case AMDGPU::V_LDEXP_F16_e64:
+  case AMDGPU::V_LDEXP_F16_e32:
+  case AMDGPU::V_ADD_F16_e64:
+  case AMDGPU::V_ADD_F16_e32:
+  case AMDGPU::V_SUB_F16_e64:
+  case AMDGPU::V_SUB_F16_e32:
+  case AMDGPU::V_SUBREV_F16_e64:
+  case AMDGPU::V_SUBREV_F16_e32:
+  case AMDGPU::V_MUL_F16_e64:
+  case AMDGPU::V_MUL_F16_e32:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F16_e32:
+  case AMDGPU::V_MIN_F16_e64:
+  case AMDGPU::V_MIN_F16_e32:
+  case AMDGPU::V_MAD_F16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+  case AMDGPU::V_DIV_FIXUP_F16_e64:
+    return true;
+  case AMDGPU::V_MADAK_F16:
+  case AMDGPU::V_MADMK_F16:
+  case AMDGPU::V_FMAMK_F16:
+  case AMDGPU::V_FMAAK_F16:
+    // NOTE(review): rejected for now; the author was unsure about these.
+    return false;
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+    // Rejected: per the author, their SDWA forms only allow dst_sel = DWORD.
+  default:
+    return false;
+  }
+}
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+                                       MachineInstr *Def1MI,
+                                       Register SrcRootReg,
+                                       const SIInstrInfo *TII) {
+  // Def1MI must already be an SDWA instruction; otherwise give up.
+  if (!TII->isSDWA(Def1MI->getOpcode()))
+    return false;
+
+  MachineOperand *Def1Src0 =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
+  MachineOperand *Def1Src1 =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
+  MachineOperand *Def0Src0 =
+      TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
+  MachineOperand *Def0Src1 =
+      TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
+
+  if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src0Sel =
+        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+    if (!Def1Src0Sel ||
+        (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+      return false;
+
+    if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src0Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+      if (!Def0Src0Sel)
+        return true;
+      if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+
+    if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src1Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+      if (!Def0Src1Sel)
+        return true;
+      if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+  }
+
+  if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src1Sel =
+        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
+    if (!Def1Src1Sel ||
+        (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+      return false;
+
+    if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src0Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+      if (!Def0Src0Sel)
+        return true;
+      if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+
+    if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+      MachineOperand *Def0Src1Sel =
+          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+      if (!Def0Src1Sel)
+        return true;
+      if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// For A, B in one MBB: true if A is at or before B, or if B is MBB->end().
+static bool dominates(MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  assert(A->getParent() == B->getParent());
+  const MachineBasicBlock *MBB = A->getParent();
+  auto MBBEnd = MBB->end();
+  if (B == MBBEnd)
+    return true;
+
+  MachineBasicBlock::const_iterator I = MBB->begin();
+  for (; &*I != A && &*I != B; ++I)
+    ; // Empty body: advance I to whichever of A or B comes first.
+
+  return &*I == A;
+}
+
+// Turn MI into SDWA form (if not already), set dst_sel and SrcMO's src sel to
+// OpSel, and set dst_unused to UNUSED_PRESERVE with vdst tied to SrcMO's reg.
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+                                              MachineOperand &SrcMO,
+                                              AMDGPU::SDWA::SdwaSel OpSel) {
+  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+  MachineInstr *SDWAInst;
+  if (TII->isSDWA(MI.getOpcode())) {
+    SDWAInst = &MI;
+  } else {
+    SDWAInst = createSDWAVersion(MI);
+    MI.eraseFromParent();
+  }
+
+  ConvertedInstructions.push_back(SDWAInst);
+  unsigned SDWAOpcode = SDWAInst->getOpcode();
+  ++NumSDWAInstructionsToEliminateFP16Pack;
+
+  MachineOperand *Dst = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst);
+  assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+  MachineOperand *DstSel =
+      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+  assert(DstSel &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+  DstSel->setImm(OpSel);
+
+  MachineOperand *DstUnused =
+      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+  assert(DstUnused &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+  assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+         "Dst_unused should not be UNUSED_PRESERVE already");
+  DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+  auto PreserveDstIdx =
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+  assert(PreserveDstIdx != -1);
+  auto NewSrcImplitMO = MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+  copyRegOperand(NewSrcImplitMO, SrcMO);
+  SDWAInst->addOperand(NewSrcImplitMO);
+  SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+
+  MachineOperand *Src0 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0);
+  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
+  if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
+    MachineOperand *Src0Sel =
+        TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0_sel);
+    assert(Src0Sel &&
+           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
+    Src0Sel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    return;
+  }
+
+  MachineOperand *Src1 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1);
+  assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
+  if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
+    MachineOperand *Src1Sel =
+        TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1_sel);
+    assert(Src1Sel &&
+           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
+    Src1Sel->setImm(OpSel);
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    return;
+  }
+}
+
+// Iteratively walks up from ParentMI through single-use operands defined by
+// SDWA-convertible FP16 instructions, pushing each such def on DefSrcQueue.
+unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
+    MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
+    const GCNSubtarget &ST) {
+  unsigned NumOfFP16Def;
+  do {
+    MachineInstr *NextMIInChain = nullptr;
+    NumOfFP16Def = 0;
+    for (MachineOperand &currentMO : ParentMI->uses()) {
+      if (!currentMO.isReg() || currentMO.getReg().isPhysical() ||
+          !MRI->hasOneUse(currentMO.getReg()))
+        continue;
+
+      MachineOperand *DefCurrMO = findSingleRegDef(&currentMO, MRI);
+      if (!DefCurrMO)
+        continue;
+
+      MachineInstr *DefCurrMI = DefCurrMO->getParent();
+      if (!isSrcDestFP16Bits(DefCurrMI, TII) ||
+          !isConvertibleToSDWA(*DefCurrMI, ST, TII))
+        continue;
+
+      NextMIInChain = DefCurrMI;
+      DefSrcQueue.push(DefCurrMO);
+      NumOfFP16Def++;
+    }
+
+    if (NumOfFP16Def > 1)
+      break;
+
+    ParentMI = NextMIInChain;
+  } while (ParentMI);
+
+  return NumOfFP16Def; // 0 when the chain ended, >1 when it forked.
+}
+
+void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
+                                          const GCNSubtarget &ST) {
+  if (!ST.has16BitInsts())
+    return;
+
+  for (MachineInstr &MI : make_early_inc_range(MBB)) {
+    if (MI.getOpcode() == AMDGPU::V_PACK_B32_F16_e64) {
+      LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
+      std::queue<MachineOperand *> DefSrc0Queue;
+      std::queue<MachineOperand *> DefSrc1Queue;
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+      if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+          !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+          Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+        continue;
+
+      MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+      MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+
+      if (!Op0 || !Op1)
+        continue;
+
+      MachineInstr *ParentMIOp0 = Op0->getParent();
+      MachineInstr *ParentMIOp1 = Op1->getParent();
+
+      if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+          !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+          !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+          !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+        continue;
+
+      DefSrc0Queue.push(Op0);
+      DefSrc1Queue.push(Op1);
+
+      // Extend each chain upward; bail out if any step has more than one
+      // register use defined by an SDWA-convertible FP16 instruction.
+      unsigned NumOfFP16Def;
+
+      NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
+      if (NumOfFP16Def > 1)
+        continue;
+
+      NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
+      if (NumOfFP16Def > 1)
+        continue;
+
+      MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+      MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+      Register SrcRootMOReg = AMDGPU::NoRegister;
+
+      // Now check whether the last instruction of each DefSrcQueue shares a
+      // common register operand; that register is the source root of the
+      // element-wise fp16 chains.
+      for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+        if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+          continue;
+
+        for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+          if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+            continue;
+
+          if (Current0MO.getReg() == Current1MO.getReg() &&
+              Current0MO.getSubReg() == Current1MO.getSubReg()) {
+            SrcRootMOReg = Current0MO.getReg();
+            break;
+          }
+        }
+        // Found it; no further checks are needed.
+        if (SrcRootMOReg != AMDGPU::NoRegister)
+          break;
+      }
+
+      if (SrcRootMOReg == AMDGPU::NoRegister)
+        continue;
+
+      // Also ensure that Def0RootMI and Def1RootMI access the lower and upper
+      // half-word of SrcRootMOReg respectively.
+      if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg,
+                                      TII))
+        continue;
+
+      // The graph below represents the connection :
+      //       Op0Intial  -->  Op0x  --> ...  -->  Op0Final
+      //      /                                       \'
+      // SrcRootMO                                    v_Pack_b32_f16
+      //      \                                       /
+      //       Op1Intial  -->  Op1x  --> ...  -->  Op1Final
+      // The nomenclature is based upon above flow-graph
+      //
+      // Also for each of DefSrcXQueue :
+      // OpXIntial is at back & OpXFinal is at front
+      auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
+      auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
+      auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
+      auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
+
+      MachineOperand *FinalOutMO = nullptr;
+      std::queue<MachineOperand *> ChainedDefOps;
+      AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+      int NumOfElemInSecondOpChain = 0;
+
+      // Now rewire the flow according to MI dominance as follows, if possible,
+      // and record it in ChainedDefOps so it can later be used to convert the
+      // instructions into their SDWA versions:
+      //
+      // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
+      // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
+      // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
+      //
+      // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
+      // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
+      // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
+      //
+      // TODO : Else, not handled!
+      // One such case is observed when multiple fp16 instruction are chained
+      // on a fp16 vector input. For Example :
+      //
+      // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+      // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+      // return <2 x half> %res
+      if (dominates(Op0FinalMI, Op1IntialMI)) {
+        int OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &MOTo = Op1IntialMI->getOperand(OpIdx);
+        auto MOFrom = DefSrc0Queue.front();
+        copyRegOperand(MOTo, *MOFrom, true);
+        FinalOutMO = DefSrc1Queue.front();
+
+        LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op1IntialMI
+                          << '\n');
+        OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &IntialInMO = Op0IntialMI->getOperand(OpIdx);
+
+        while (!DefSrc1Queue.empty()) {
+          ChainedDefOps.push(DefSrc1Queue.front());
+          DefSrc1Queue.pop();
+          NumOfElemInSecondOpChain++;
+        }
+        while (!DefSrc0Queue.empty()) {
+          ChainedDefOps.push(DefSrc0Queue.front());
+          DefSrc0Queue.pop();
+        }
+
+        ChainedDefOps.push(&IntialInMO);
+        OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+      } else if (dominates(Op1FinalMI, Op0IntialMI)) {
+        int OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &MOTo = Op0IntialMI->getOperand(OpIdx);
+        auto MOFrom = DefSrc1Queue.front();
+        copyRegOperand(MOTo, *MOFrom, true);
+        FinalOutMO = DefSrc0Queue.front();
+
+        LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op0IntialMI
+                          << '\n');
+        OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+        auto &IntialInMO = Op1IntialMI->getOperand(OpIdx);
+
+        while (!DefSrc0Queue.empty()) {
+          ChainedDefOps.push(DefSrc0Queue.front());
+          DefSrc0Queue.pop();
+          NumOfElemInSecondOpChain++;
+        }
+        while (!DefSrc1Queue.empty()) {
+          ChainedDefOps.push(DefSrc1Queue.front());
+          DefSrc1Queue.pop();
+        }
+
+        ChainedDefOps.push(&IntialInMO);
+        OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+      } else {
+        LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+        continue;
+      }
+
+      // Replace every use of MI's (v_pack) def operand with FinalOutMO.
+      MachineOperand &DefMO = MI.getOperand(0);
+      for (MachineOperand &MO :
+           make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+        if (!MO.isReg())
+          continue;
+
+        MO.setReg(FinalOutMO->getReg());
+        MO.setSubReg(FinalOutMO->getSubReg());
+      }
+      LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI
+                        << "With " << *FinalOutMO << '\n');
+
+      // Delete the v_pack machine instruction.
+      LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+      MI.eraseFromParent();
+      ++Num16BitPackedInstructionsEliminated;
+
+      // Convert the chained machine instructions into their SDWA versions.
+      while (ChainedDefOps.size() != 1) {
+        if (NumOfElemInSecondOpChain == 0) {
+          if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+            OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+          else
+            OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+        }
+
+        MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+        ChainedDefOps.pop();
+        MachineOperand *SrcMO = ChainedDefOps.front();
+
+        // SrcMO is a def here; use its single use operand (in DefMI) instead.
+        if (SrcMO->isDef()) {
+          assert(MRI->hasOneUse(SrcMO->getReg()));
+          SrcMO = findSingleRegUse(SrcMO, MRI);
+          assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
+        }
+
+        convertMIToSDWAWithOpsel(*DefMI, *SrcMO, OpSel);
+        NumOfElemInSecondOpChain--;
+      }
+    }
+  }
+}
+
 bool SIPeepholeSDWA::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
@@ -1451,6 +1957,12 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
       while (!ConvertedInstructions.empty())
         legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
     } while (Changed);
+
+    // Process each v_pack_b32_fp16 instruction in MBB.
+    eliminateFP16Packing(MBB, ST);
+    Ret |= !ConvertedInstructions.empty();
+    while (!ConvertedInstructions.empty())
+      legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
   }
 
   return Ret;

>From 3aae24a6f3f104efe57fc8bf3920ff40179f23ed Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 06:56:22 +0000
Subject: [PATCH 2/8] Update the LIT tests to accommodate the patch effects.

---
 .../AMDGPU/GlobalISel/combine-fma-sub-mul.ll  |  88 +++---
 .../GlobalISel/combine-fma-sub-neg-mul.ll     |  40 ++-
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll     | 100 ++++---
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |  20 +-
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |  51 ++--
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |  11 +-
 llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll      |  18 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |  14 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |  18 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  35 +--
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  23 +-
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll        |  44 ++-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          | 260 ++++++++++--------
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        | 260 ++++++++++--------
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         | 110 +++-----
 llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll     |   5 +-
 llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll      |  18 +-
 llvm/test/CodeGen/AMDGPU/repeated-divisor.ll  |  12 +-
 llvm/test/CodeGen/AMDGPU/roundeven.ll         |  80 +++---
 19 files changed, 594 insertions(+), 613 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index 99bdcdd1f31e5..6bf833f067b32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -545,12 +545,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -565,12 +563,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_mul:
@@ -578,12 +574,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -598,12 +592,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -644,12 +636,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -664,12 +656,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_mul_rhs:
@@ -677,12 +669,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -697,12 +689,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index 70f961e2777af..ad11c9b5f28ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -221,12 +221,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -241,12 +239,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -254,12 +250,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -274,12 +268,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul <4 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 870a7482f0c97..f3ead3e96e5d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1101,21 +1101,21 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fdiv_v2f16_afn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_afn:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_afn:
@@ -2782,17 +2782,15 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX9-LABEL: v_rcp_v2f16_arcp:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_arcp:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_arcp:
@@ -2834,17 +2832,15 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
 ; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
@@ -3192,21 +3188,21 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
@@ -3310,21 +3306,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3372,21 +3368,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index c43731893c2d7..12d7f9d4af8c4 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -2188,17 +2188,15 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX9-LABEL: v_rsq_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_rsq_v2f16:
@@ -2398,17 +2396,15 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX9-LABEL: v_neg_rsq_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX9-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX10-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 63ba18a5433aa..68a3db1472aa2 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -187,22 +187,18 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; GFX10-LABEL: fmul_pow2_8xhalf:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v4, v3
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v5, v2
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v6, v1
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v7, v0
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v7, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v6, v1
-; GFX10-NEXT:    v_pack_b32_f16 v2, v5, v2
-; GFX10-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
@@ -302,18 +298,14 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7000
-; GFX10-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v3
-; GFX10-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
-; GFX10-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
-; GFX10-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
-; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v8, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v7, v1
-; GFX10-NEXT:    v_pack_b32_f16 v2, v6, v2
-; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: fmul_pow2_ldexp_8xhalf:
@@ -1085,9 +1077,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v1, v0
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 4ee48716439bd..4290ab71b0d62 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1698,9 +1698,8 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX8-LABEL: basic_fract_v2f16_nonan:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_fract_f16_e32 v1, v0
-; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan:
@@ -2721,15 +2720,15 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s6, 0x204
-; GFX8-NEXT:    v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_floor_f16_e32 v4, v0
 ; GFX8-NEXT:    v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cmp_class_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX8-NEXT:    v_fract_f16_e32 v4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GFX8-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, s6
+; GFX8-NEXT:    v_floor_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX8-NEXT:    v_floor_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v0, v5
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 8c5bc4a33a303..a8ddc564e4b51 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -188,11 +188,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cos_f16_e32 v2, v3
-; GFX9-NEXT:    v_cos_f16_e32 v1, v1
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -204,11 +203,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cos_f16_e32 v2, v3
-; GFX10-NEXT:    v_cos_f16_e32 v1, v1
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 978f223aafb94..c48df8b4f5b09 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6717,9 +6717,8 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
 ; GFX900-SDAG-NEXT:    v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v0
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v2f16_fast:
@@ -6898,13 +6897,12 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 70c3787bac9a1..3e2a3d2f4e3dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6809,11 +6809,10 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_v2f16_fast:
@@ -6992,13 +6991,12 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 15bcab9f774e4..b7f180fefac94 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -3333,9 +3333,8 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v2f16:
@@ -3411,9 +3410,8 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fabs_v2f16:
@@ -3495,9 +3493,8 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -3580,9 +3577,8 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fneg_v2f16:
@@ -3650,9 +3646,8 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v2f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v2f16_fast:
@@ -3732,10 +3727,9 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp_v3f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v3f16:
@@ -3816,10 +3810,9 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v3f16_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index f44faf4f7edba..c7dcbcfde6d89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -411,9 +411,9 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v3, v2, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v2, v1, 0, 16
@@ -522,13 +522,14 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v3, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v0, 0, 16
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v3, v4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
@@ -628,9 +629,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
@@ -698,9 +698,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 0e66b0af99f34..fe05fdf1226ec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -461,10 +461,9 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v3
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v3
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -585,9 +584,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -697,12 +695,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v3, v3, s4, v5
-; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v5
 ; GFX9-SDAG-NEXT:    v_med3_i32 v4, v4, s4, v5
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v5
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -844,10 +841,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v4
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -973,15 +969,13 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v5, v5, s4, v6
-; GFX9-SDAG-NEXT:    v_med3_i32 v4, v4, s4, v6
 ; GFX9-SDAG-NEXT:    v_med3_i32 v3, v3, s4, v6
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_med3_i32 v4, v4, s4, v6
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v6
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v5
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v1, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1157,12 +1151,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v5
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 0d5846a4a4985..d64b7fed6a127 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6869,15 +6869,25 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v2f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7004,22 +7014,22 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_v2f16:
@@ -7156,22 +7166,22 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_v2f16:
@@ -7309,22 +7319,22 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_v2f16:
@@ -7444,15 +7454,25 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v2f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7571,17 +7591,29 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v3f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7704,17 +7736,29 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v3f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7875,31 +7919,29 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16:
@@ -8070,31 +8112,29 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 8006876dbe3ff..8a9f1c24dfad8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6869,15 +6869,25 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v2f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7004,22 +7014,22 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_v2f16:
@@ -7156,22 +7166,22 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7309,22 +7319,22 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_v2f16:
@@ -7444,15 +7454,25 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v2f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7571,17 +7591,29 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v3f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7704,17 +7736,29 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v3f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7875,31 +7919,29 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16:
@@ -8070,31 +8112,29 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log10_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index c1ac74e5094b0..11a59b3262c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -4247,17 +4247,15 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16:
@@ -4365,18 +4363,16 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_v2f16:
@@ -4492,18 +4488,16 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2f16:
@@ -4620,18 +4614,16 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_v2f16:
@@ -4733,17 +4725,15 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v2f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v2f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16_fast:
@@ -4855,19 +4845,17 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v3f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v3f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16:
@@ -4977,19 +4965,17 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v3f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v3f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16_fast:
@@ -5117,23 +5103,19 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16:
@@ -5264,23 +5246,19 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index de12f2b246f57..c6fee73f4580d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -198,9 +198,8 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 1a426096da197..92e8dce75222a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -188,11 +188,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_sin_f16_e32 v2, v3
-; GFX9-NEXT:    v_sin_f16_e32 v1, v1
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -204,11 +203,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_sin_f16_e32 v2, v3
-; GFX10-NEXT:    v_sin_f16_e32 v1, v1
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index d34d2050b157e..68a3349e9b80a 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -829,9 +829,8 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
 ; GFX9-LABEL: v_repeat_divisor_v2f16_x2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v2
-; GFX9-NEXT:    v_pack_b32_f16 v2, v2, v3
+; GFX9-NEXT:    v_rcp_f16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rcp_f16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -932,15 +931,14 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
 ; GFX9-LABEL: v_repeat_divisor_v3f16_x2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rcp_f16_e32 v4, v4
+; GFX9-NEXT:    v_rcp_f16_sdwa v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    v_rcp_f16_e32 v5, v5
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7e00
-; GFX9-NEXT:    v_pack_b32_f16 v4, v4, v6
+; GFX9-NEXT:    v_rcp_f16_sdwa v4, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX9-NEXT:    v_pack_b32_f16 v5, v5, s4
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5
 ; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v5
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_mul_f16 v4, v2, v4
 ; GFX9-NEXT:    v_alignbit_b32 v2, v3, v4, 16
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 59a1fe041bf90..97358044abdaa 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -460,17 +460,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; GFX9-LABEL: v_roundeven_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -523,17 +521,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -602,18 +598,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v2f16_fneg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -676,17 +670,15 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e64 v0, -v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e64 v0, -v0
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -759,23 +751,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX9-LABEL: v_roundeven_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rndne_f16_e32 v3, v1
-; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v4f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
-; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v4f16:
@@ -850,23 +838,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v1, v1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v1, v1
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v4f16:

>From 8b1230d47383eefd82974b709e61a821e88bf24a Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 10:13:40 +0000
Subject: [PATCH 3/8] [AMDGPU][NFC] Added Pre-commit tests for #137137

This adds an llc LIT test for vector fp16 operations such as log, exp, etc.
It acts as the pre-commit test for GitHub PR #137137.
---
 llvm/test/CodeGen/AMDGPU/vector-fp16.ll | 2758 +++++++++++++++++++++++
 1 file changed, 2758 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/vector-fp16.ll

diff --git a/llvm/test/CodeGen/AMDGPU/vector-fp16.ll b/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
new file mode 100644
index 0000000000000..501630e790200
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
@@ -0,0 +1,2758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare <1 x half> @llvm.sin.v1f16(<1 x half>)
+declare <1 x half> @llvm.cos.v1f16(<1 x half>)
+declare <1 x half> @llvm.log.v1f16(<1 x half>)
+declare <1 x half> @llvm.log2.v1f16(<1 x half>)
+declare <1 x half> @llvm.log10.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp2.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp10.v1f16(<1 x half>)
+declare <1 x half> @llvm.sqrt.v1f16(<1 x half>)
+
+declare <2 x half> @llvm.sin.v2f16(<2 x half>)
+declare <2 x half> @llvm.cos.v2f16(<2 x half>)
+declare <2 x half> @llvm.log.v2f16(<2 x half>)
+declare <2 x half> @llvm.log2.v2f16(<2 x half>)
+declare <2 x half> @llvm.log10.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp2.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp10.v2f16(<2 x half>)
+declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
+
+declare <4 x half> @llvm.sin.v4f16(<4 x half>)
+declare <4 x half> @llvm.cos.v4f16(<4 x half>)
+declare <4 x half> @llvm.log.v4f16(<4 x half>)
+declare <4 x half> @llvm.log2.v4f16(<4 x half>)
+declare <4 x half> @llvm.log10.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp2.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp10.v4f16(<4 x half>)
+declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
+
+declare <5 x half> @llvm.sin.v5f16(<5 x half>)
+declare <5 x half> @llvm.cos.v5f16(<5 x half>)
+declare <5 x half> @llvm.log.v5f16(<5 x half>)
+declare <5 x half> @llvm.log2.v5f16(<5 x half>)
+declare <5 x half> @llvm.log10.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp2.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp10.v5f16(<5 x half>)
+declare <5 x half> @llvm.sqrt.v5f16(<5 x half>)
+
+
+define <1 x half> @sin_v1f16(<1 x half> %a) {
+; GFX8-LABEL: sin_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_sin_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sin_v1f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT:    v_sin_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT:    v_sin_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sin_f16_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.sin.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <1 x half> @cos_v1f16(<1 x half> %a) {
+; GFX8-LABEL: cos_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_cos_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cos_v1f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT:    v_cos_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT:    v_cos_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cos_f16_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.cos.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <1 x half> @log_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v1f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_e32 v0, v0
+; GFX906-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v1f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_e32 v0, v0
+; GFX908-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v1f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_e32 v0, v0
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.log.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <1 x half> @log2_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log2_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v1f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_log_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.log2.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <1 x half> @log10_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log10_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v1f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_e32 v0, v0
+; GFX906-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v1f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_e32 v0, v0
+; GFX908-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v1f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_e32 v0, v0
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.log10.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <1 x half> @exp_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp_v1f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX906-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX906-NEXT:    v_exp_f32_e32 v0, v0
+; GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp_v1f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX908-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX908-NEXT:    v_exp_f32_e32 v0, v0
+; GFX908-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp_v1f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX942-NEXT:    v_exp_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.exp.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <1 x half> @exp2_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp2_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_exp_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v1f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_exp_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_exp_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_exp_f16_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.exp2.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <1 x half> @exp10_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp10_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp10_v1f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX906-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX906-NEXT:    v_exp_f32_e32 v0, v0
+; GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp10_v1f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX908-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX908-NEXT:    v_exp_f32_e32 v0, v0
+; GFX908-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp10_v1f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX942-NEXT:    v_exp_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.exp10.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <1 x half> @sqrt_v1f16(<1 x half> %a) {
+; GFX8-LABEL: sqrt_v1f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v1f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v1f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v1f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <1 x half> @llvm.sqrt.v1f16(<1 x half> %a)
+	ret <1 x half> %res
+}
+
+define <2 x half> @sin_v2f16(<2 x half> %a) {
+; GFX8-LABEL: sin_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT:    v_fract_f16_e32 v1, v1
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_sin_f16_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v2f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v2f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT:    v_sin_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sin_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.sin.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <2 x half> @cos_v2f16(<2 x half> %a) {
+; GFX8-LABEL: cos_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT:    v_fract_f16_e32 v1, v1
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cos_f16_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v2f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v2f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT:    v_cos_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cos_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.cos.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <2 x half> @log_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v2f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_movk_i32 s4, 0x398c
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_movk_i32 s4, 0x398c
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v2f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_movk_i32 s0, 0x398c
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x398c
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.log.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <2 x half> @log2_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log2_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log2_v2f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log2_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log2_v2f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.log2.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <2 x half> @log10_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log10_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v2f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v2f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_movk_i32 s0, 0x34d1
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x34d1
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.log10.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <2 x half> @exp_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT:    v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.exp.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <2 x half> @exp2_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp2_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_exp_f16_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp2_v2f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp2_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp2_v2f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_exp_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_exp_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <2 x half> @exp10_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp10_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT:    v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.exp10.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <2 x half> @sqrt_v2f16(<2 x half> %a) {
+; GFX8-LABEL: sqrt_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sqrt_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sqrt_v2f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sqrt_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sqrt_v2f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+	ret <2 x half> %res
+}
+
+define <4 x half> @sin_v4f16(<4 x half> %a) {
+; GFX8-LABEL: sin_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v4, 0.15915494, v0
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_fract_f16_e32 v2, v2
+; GFX8-NEXT:    v_fract_f16_e32 v1, v1
+; GFX8-NEXT:    v_fract_f16_e32 v4, v4
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_sin_f16_e32 v2, v2
+; GFX8-NEXT:    v_sin_f16_e32 v4, v4
+; GFX8-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v4f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0x3118
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v4f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v2, 0x3118
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v4f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x3118
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sin_f16_e32 v1, v1
+; GFX11-NEXT:    v_sin_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sin_f16_e32 v2, v2
+; GFX11-NEXT:    v_sin_f16_e32 v3, v3
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <4 x half> @cos_v4f16(<4 x half> %a) {
+; GFX8-LABEL: cos_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v4, 0.15915494, v0
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_fract_f16_e32 v2, v2
+; GFX8-NEXT:    v_fract_f16_e32 v1, v1
+; GFX8-NEXT:    v_fract_f16_e32 v4, v4
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_cos_f16_e32 v2, v2
+; GFX8-NEXT:    v_cos_f16_e32 v4, v4
+; GFX8-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v4f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0x3118
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v4f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v2, 0x3118
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v4f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x3118
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cos_f16_e32 v1, v1
+; GFX11-NEXT:    v_cos_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cos_f16_e32 v2, v2
+; GFX11-NEXT:    v_cos_f16_e32 v3, v3
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <4 x half> @log_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v2, v1
+; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v3, v0
+; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x398c
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v4f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_movk_i32 s4, 0x398c
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v4f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_movk_i32 s4, 0x398c
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v4f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_movk_i32 s0, 0x398c
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x398c
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f16_e32 v2, v2
+; GFX11-NEXT:    v_log_f16_e32 v3, v3
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <4 x half> @log2_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log2_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_log_f16_e32 v1, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f16_e32 v2, v2
+; GFX11-NEXT:    v_log_f16_e32 v3, v3
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <4 x half> @log10_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log10_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v2, v1
+; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v3, v0
+; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x34d1
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v4f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v4f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v4f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_movk_i32 s0, 0x34d1
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x34d1
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f16_e32 v2, v2
+; GFX11-NEXT:    v_log_f16_e32 v3, v3
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.log10.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <4 x half> @exp_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
+; GFX8-NEXT:    v_exp_f32_e32 v3, v3
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
+; GFX9-NEXT:    v_exp_f32_e32 v3, v3
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT:    v_exp_f32_e32 v2, v2
+; GFX10-NEXT:    v_exp_f32_e32 v3, v3
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT:    v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_exp_f32_e32 v2, v2
+; GFX11-NEXT:    v_exp_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <4 x half> @exp2_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp2_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_exp_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_exp_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_exp_f16_e32 v0, v0
+; GFX8-NEXT:    v_exp_f16_e32 v1, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_exp_f16_e32 v1, v1
+; GFX11-NEXT:    v_exp_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_exp_f16_e32 v2, v2
+; GFX11-NEXT:    v_exp_f16_e32 v3, v3
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.exp2.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <4 x half> @exp10_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp10_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
+; GFX8-NEXT:    v_exp_f32_e32 v3, v3
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
+; GFX9-NEXT:    v_exp_f32_e32 v3, v3
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT:    v_exp_f32_e32 v2, v2
+; GFX10-NEXT:    v_exp_f32_e32 v3, v3
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT:    v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_exp_f32_e32 v2, v2
+; GFX11-NEXT:    v_exp_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.exp10.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <4 x half> @sqrt_v4f16(<4 x half> %a) {
+; GFX8-LABEL: sqrt_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sqrt_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_sqrt_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sqrt_f16_e32 v2, v2
+; GFX11-NEXT:    v_sqrt_f16_e32 v3, v3
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
+	ret <4 x half> %res
+}
+
+define <5 x half> @sin_v5f16(<5 x half> %a) {
+; GFX8-LABEL: sin_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3118
+; GFX8-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v5, 0.15915494, v0
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT:    v_fract_f16_e32 v3, v3
+; GFX8-NEXT:    v_fract_f16_e32 v1, v1
+; GFX8-NEXT:    v_fract_f16_e32 v5, v5
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_sin_f16_e32 v3, v3
+; GFX8-NEXT:    v_sin_f16_e32 v5, v5
+; GFX8-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_fract_f16_e32 v2, v2
+; GFX8-NEXT:    v_sin_f16_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v5f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX906-NEXT:    v_sin_f16_e32 v2, v2
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v5f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX908-NEXT:    v_sin_f16_e32 v2, v2
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v5f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX942-NEXT:    v_sin_f16_e32 v2, v2
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_e32 v2, v2
+; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT:    v_sin_f16_e32 v1, v1
+; GFX11-NEXT:    v_sin_f16_e32 v0, v0
+; GFX11-NEXT:    v_sin_f16_e32 v2, v2
+; GFX11-NEXT:    v_sin_f16_e32 v3, v3
+; GFX11-NEXT:    v_sin_f16_e32 v4, v4
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.sin.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <5 x half> @cos_v5f16(<5 x half> %a) {
+; GFX8-LABEL: cos_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3118
+; GFX8-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v5, 0.15915494, v0
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT:    v_fract_f16_e32 v3, v3
+; GFX8-NEXT:    v_fract_f16_e32 v1, v1
+; GFX8-NEXT:    v_fract_f16_e32 v5, v5
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_cos_f16_e32 v3, v3
+; GFX8-NEXT:    v_cos_f16_e32 v5, v5
+; GFX8-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_fract_f16_e32 v2, v2
+; GFX8-NEXT:    v_cos_f16_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v5f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX906-NEXT:    v_cos_f16_e32 v2, v2
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v5f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX908-NEXT:    v_cos_f16_e32 v2, v2
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v5f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX942-NEXT:    v_cos_f16_e32 v2, v2
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_e32 v2, v2
+; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT:    v_cos_f16_e32 v1, v1
+; GFX11-NEXT:    v_cos_f16_e32 v0, v0
+; GFX11-NEXT:    v_cos_f16_e32 v2, v2
+; GFX11-NEXT:    v_cos_f16_e32 v3, v3
+; GFX11-NEXT:    v_cos_f16_e32 v4, v4
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.cos.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <5 x half> @log_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v3, v1
+; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v4, v0
+; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x398c
+; GFX8-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v4, 0x398c, v4
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v5f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_movk_i32 s4, 0x398c
+; GFX906-NEXT:    v_log_f16_e32 v2, v2
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v5f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_movk_i32 s4, 0x398c
+; GFX908-NEXT:    v_log_f16_e32 v2, v2
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v5f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_movk_i32 s0, 0x398c
+; GFX942-NEXT:    v_log_f16_e32 v2, v2
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x398c
+; GFX10-NEXT:    v_log_f16_e32 v2, v2
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    v_log_f16_e32 v2, v2
+; GFX11-NEXT:    v_log_f16_e32 v3, v3
+; GFX11-NEXT:    v_log_f16_e32 v4, v4
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
+; GFX11-NEXT:    v_mul_f16_e32 v4, 0x398c, v4
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.log.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <5 x half> @log2_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log2_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_log_f16_e32 v1, v1
+; GFX8-NEXT:    v_log_f16_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v5f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_log_f16_e32 v2, v2
+; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_e32 v2, v2
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    v_log_f16_e32 v2, v2
+; GFX11-NEXT:    v_log_f16_e32 v3, v3
+; GFX11-NEXT:    v_log_f16_e32 v4, v4
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <5 x half> @log10_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log10_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v3, v1
+; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v4, v0
+; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x34d1
+; GFX8-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v4, 0x34d1, v4
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v5f16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX906-NEXT:    v_log_f16_e32 v2, v2
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v5f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX908-NEXT:    v_log_f16_e32 v2, v2
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v5f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT:    s_movk_i32 s0, 0x34d1
+; GFX942-NEXT:    v_log_f16_e32 v2, v2
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x34d1
+; GFX10-NEXT:    v_log_f16_e32 v2, v2
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    v_log_f16_e32 v2, v2
+; GFX11-NEXT:    v_log_f16_e32 v3, v3
+; GFX11-NEXT:    v_log_f16_e32 v4, v4
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
+; GFX11-NEXT:    v_mul_f16_e32 v4, 0x34d1, v4
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.log10.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <5 x half> @exp_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT:    v_exp_f32_e32 v3, v3
+; GFX8-NEXT:    v_exp_f32_e32 v4, v4
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v5f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT:    v_exp_f32_e32 v3, v3
+; GFX9-NEXT:    v_exp_f32_e32 v4, v4
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v0
+; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT:    v_exp_f32_e32 v3, v3
+; GFX10-NEXT:    v_exp_f32_e32 v4, v4
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    v_exp_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT:    v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT:    v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    v_exp_f32_e32 v2, v2
+; GFX11-NEXT:    v_exp_f32_e32 v3, v3
+; GFX11-NEXT:    v_exp_f32_e32 v4, v4
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.exp.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <5 x half> @exp2_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp2_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_exp_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_exp_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_exp_f16_e32 v0, v0
+; GFX8-NEXT:    v_exp_f16_e32 v1, v1
+; GFX8-NEXT:    v_exp_f16_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v5f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_exp_f16_e32 v2, v2
+; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_exp_f16_e32 v2, v2
+; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_exp_f16_e32 v1, v1
+; GFX11-NEXT:    v_exp_f16_e32 v0, v0
+; GFX11-NEXT:    v_exp_f16_e32 v2, v2
+; GFX11-NEXT:    v_exp_f16_e32 v3, v3
+; GFX11-NEXT:    v_exp_f16_e32 v4, v4
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.exp2.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <5 x half> @exp10_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp10_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT:    v_exp_f32_e32 v3, v3
+; GFX8-NEXT:    v_exp_f32_e32 v4, v4
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v5f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT:    v_exp_f32_e32 v3, v3
+; GFX9-NEXT:    v_exp_f32_e32 v4, v4
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v0
+; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT:    v_exp_f32_e32 v3, v3
+; GFX10-NEXT:    v_exp_f32_e32 v4, v4
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    v_exp_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT:    v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT:    v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    v_exp_f32_e32 v2, v2
+; GFX11-NEXT:    v_exp_f32_e32 v3, v3
+; GFX11-NEXT:    v_exp_f32_e32 v4, v4
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.exp10.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <5 x half> @sqrt_v5f16(<5 x half> %a) {
+; GFX8-LABEL: sqrt_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sqrt_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_sqrt_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX8-NEXT:    v_sqrt_f16_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v5f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_sqrt_f16_e32 v2, v2
+; GFX9-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_sqrt_f16_e32 v2, v2
+; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    v_sqrt_f16_e32 v2, v2
+; GFX11-NEXT:    v_sqrt_f16_e32 v3, v3
+; GFX11-NEXT:    v_sqrt_f16_e32 v4, v4
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%res = call <5 x half> @llvm.sqrt.v5f16(<5 x half> %a)
+	ret <5 x half> %res
+}
+
+define <4 x half> @cascaded_v4f16(<4 x half> %a) {
+; GFX8-LABEL: cascaded_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v2, v1
+; GFX8-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX8-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT:    v_fract_f16_e32 v2, v2
+; GFX8-NEXT:    v_fract_f16_e32 v1, v1
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_fract_f16_e32 v3, v3
+; GFX8-NEXT:    v_sin_f16_e32 v2, v2
+; GFX8-NEXT:    v_sin_f16_e32 v0, v0
+; GFX8-NEXT:    v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cascaded_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_log_f16_e32 v2, v1
+; GFX9-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_log_f16_e32 v0, v0
+; GFX9-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT:    v_sin_f16_e32 v2, v2
+; GFX9-NEXT:    v_sin_f16_e32 v0, v0
+; GFX9-NEXT:    v_sin_f16_e32 v3, v3
+; GFX9-NEXT:    v_sin_f16_e32 v1, v1
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cascaded_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_e32 v2, v1
+; GFX10-NEXT:    v_log_f16_e32 v3, v0
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX10-NEXT:    v_sin_f16_e32 v2, v2
+; GFX10-NEXT:    v_sin_f16_e32 v3, v3
+; GFX10-NEXT:    v_sin_f16_e32 v0, v0
+; GFX10-NEXT:    v_sin_f16_e32 v1, v1
+; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cascaded_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f16_e32 v2, v2
+; GFX11-NEXT:    v_log_f16_e32 v3, v3
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT:    v_sin_f16_e32 v1, v1
+; GFX11-NEXT:    v_sin_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sin_f16_e32 v2, v2
+; GFX11-NEXT:    v_sin_f16_e32 v3, v3
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+  %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
+	ret <4 x half> %res
+}
+
+define <5 x half> @cascaded_v5f16(<5 x half> %a) {
+; GFX8-LABEL: cascaded_v5f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v4, v1
+; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_log_f16_e32 v2, v2
+; GFX8-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
+; GFX8-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT:    v_fract_f16_e32 v4, v4
+; GFX8-NEXT:    v_fract_f16_e32 v1, v1
+; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    v_fract_f16_e32 v3, v3
+; GFX8-NEXT:    v_sin_f16_e32 v4, v4
+; GFX8-NEXT:    v_sin_f16_e32 v0, v0
+; GFX8-NEXT:    v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_fract_f16_e32 v2, v2
+; GFX8-NEXT:    v_sin_f16_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cascaded_v5f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_log_f16_e32 v4, v1
+; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_log_f16_e32 v0, v0
+; GFX9-NEXT:    v_log_f16_e32 v2, v2
+; GFX9-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
+; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX9-NEXT:    v_sin_f16_e32 v4, v4
+; GFX9-NEXT:    v_sin_f16_e32 v0, v0
+; GFX9-NEXT:    v_sin_f16_e32 v3, v3
+; GFX9-NEXT:    v_sin_f16_e32 v1, v1
+; GFX9-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX9-NEXT:    v_sin_f16_e32 v2, v2
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX9-NEXT:    v_pack_b32_f16 v1, v4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cascaded_v5f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_e32 v3, v1
+; GFX10-NEXT:    v_log_f16_e32 v4, v0
+; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_log_f16_e32 v2, v2
+; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX10-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX10-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT:    v_sin_f16_e32 v3, v3
+; GFX10-NEXT:    v_sin_f16_e32 v4, v4
+; GFX10-NEXT:    v_sin_f16_e32 v0, v0
+; GFX10-NEXT:    v_sin_f16_e32 v1, v1
+; GFX10-NEXT:    v_sin_f16_e32 v2, v2
+; GFX10-NEXT:    v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cascaded_v5f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_log_f16_e32 v1, v1
+; GFX11-NEXT:    v_log_f16_e32 v0, v0
+; GFX11-NEXT:    v_log_f16_e32 v2, v2
+; GFX11-NEXT:    v_log_f16_e32 v3, v3
+; GFX11-NEXT:    v_log_f16_e32 v4, v4
+; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
+; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT:    v_sin_f16_e32 v1, v1
+; GFX11-NEXT:    v_sin_f16_e32 v0, v0
+; GFX11-NEXT:    v_sin_f16_e32 v2, v2
+; GFX11-NEXT:    v_sin_f16_e32 v3, v3
+; GFX11-NEXT:    v_sin_f16_e32 v4, v4
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+	%b = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
+  %res = call <5 x half> @llvm.sin.v5f16(<5 x half> %b)
+	ret <5 x half> %res
+}

>From 9a1a08a2dc676a6cf581aae6f8ebd4e48c6171b3 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 11:10:42 +0000
Subject: [PATCH 4/8] Refactored the code in order to encapsulate redundant
 code as a lambda function.

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 57 ++++++++++++-----------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 930dc9aac5a8d..7cc93b4f9f490 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1480,8 +1480,21 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
                                        MachineInstr *Def1MI,
                                        Register SrcRootReg,
                                        const SIInstrInfo *TII) {
-  // As if could, the Def1MI would have been sdwa-ed
-  if (!TII->isSDWA(Def1MI->getOpcode()))
+  // Def1MI must already be an SDWA instruction, since it needs SDWA to access
+  // the upper half; Def0MI must not be, since it accesses the lower half.
+  if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+    return false;
+
+  // Def1 should be writing into entire DWORD of dst, with unused part set
+  // to zero-pad.
+  MachineOperand *Def1DstSel =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+  if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+    return false;
+  MachineOperand *Def1DstUnused =
+      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+  if (!Def1DstUnused ||
+      Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
     return false;
 
   MachineOperand *Def1Src0 =
@@ -1493,13 +1506,7 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
   MachineOperand *Def0Src1 =
       TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
 
-  if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
-    MachineOperand *Def1Src0Sel =
-        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
-    if (!Def1Src0Sel ||
-        (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
-      return false;
-
+  auto chkForDef0MIAccess = [&]() -> bool {
     if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
       MachineOperand *Def0Src0Sel =
           TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
@@ -1517,6 +1524,19 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
       if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
         return true;
     }
+
+    return false;
+  };
+
+  if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+    MachineOperand *Def1Src0Sel =
+        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+    if (!Def1Src0Sel ||
+        (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+      return false;
+
+    if (chkForDef0MIAccess())
+      return true;
   }
 
   if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
@@ -1526,23 +1546,8 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
         (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
       return false;
 
-    if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
-      MachineOperand *Def0Src0Sel =
-          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
-      if (!Def0Src0Sel)
-        return true;
-      if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
-        return true;
-    }
-
-    if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
-      MachineOperand *Def0Src1Sel =
-          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
-      if (!Def0Src1Sel)
-        return true;
-      if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
-        return true;
-    }
+    if (chkForDef0MIAccess())
+      return true;
   }
 
   return false;

>From 9d75623d0a13c478917b0322d0448790e53bf6ae Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 28 Apr 2025 09:33:53 +0000
Subject: [PATCH 5/8] Addressed review comments about redundant code &
 complex logic.

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 450 ++++++++++------------
 1 file changed, 214 insertions(+), 236 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 7cc93b4f9f490..4ab731f149c93 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -78,7 +78,7 @@ class SIPeepholeSDWA {
   computeMIChainsForPackedOps(MachineInstr *ParentMI,
                               std::queue<MachineOperand *> &DefSrcQueue,
                               const GCNSubtarget &ST);
-  void convertMIToSDWAWithOpsel(MachineInstr &MI, MachineOperand &SrcMO,
+  void convertMIToSDWAWithOpsel(MachineInstr *MI, MachineOperand &SrcMO,
                                 AMDGPU::SDWA::SdwaSel OpSel);
 
 public:
@@ -281,17 +281,13 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
 
 #endif
 
-static void copyRegOperand(MachineOperand &To, const MachineOperand &From,
-                           bool isKill = false) {
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
   assert(To.isReg() && From.isReg());
   To.setReg(From.getReg());
   To.setSubReg(From.getSubReg());
   To.setIsUndef(From.isUndef());
   if (To.isUse()) {
-    if (isKill)
-      To.setIsKill(true);
-    else
-      To.setIsKill(From.isKill());
+    To.setIsKill(From.isKill());
   } else {
     To.setIsDead(From.isDead());
   }
@@ -1506,22 +1502,20 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
   MachineOperand *Def0Src1 =
       TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
 
-  auto chkForDef0MIAccess = [&]() -> bool {
+  auto checkForDef0MIAccess = [&]() -> bool {
     if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
       MachineOperand *Def0Src0Sel =
           TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
-      if (!Def0Src0Sel)
-        return true;
-      if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+      if (!Def0Src0Sel ||
+          Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
         return true;
     }
 
     if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
       MachineOperand *Def0Src1Sel =
           TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
-      if (!Def0Src1Sel)
-        return true;
-      if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+      if (!Def0Src1Sel ||
+          Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
         return true;
     }
 
@@ -1531,22 +1525,20 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
   if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
     MachineOperand *Def1Src0Sel =
         TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
-    if (!Def1Src0Sel ||
-        (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+    if (!Def1Src0Sel || Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
       return false;
 
-    if (chkForDef0MIAccess())
+    if (checkForDef0MIAccess())
       return true;
   }
 
   if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
     MachineOperand *Def1Src1Sel =
         TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
-    if (!Def1Src1Sel ||
-        (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+    if (!Def1Src1Sel || Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
       return false;
 
-    if (chkForDef0MIAccess())
+    if (checkForDef0MIAccess())
       return true;
   }
 
@@ -1571,71 +1563,69 @@ static bool dominates(MachineBasicBlock::const_iterator A,
 
 // Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
 // and preserving the rest of Dst's bits.
-void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
                                               MachineOperand &SrcMO,
                                               AMDGPU::SDWA::SdwaSel OpSel) {
-  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+  LLVM_DEBUG(dbgs() << "Convert instruction:" << *MI);
 
-  MachineInstr *SDWAInst;
-  if (TII->isSDWA(MI.getOpcode())) {
-    SDWAInst = &MI;
-  } else {
-    SDWAInst = createSDWAVersion(MI);
-    MI.eraseFromParent();
+  if (!TII->isSDWA(MI->getOpcode())) {
+    MachineInstr *SDWAInst = createSDWAVersion(*MI);
+    MI->eraseFromParent();
+    MI = SDWAInst;
   }
 
-  ConvertedInstructions.push_back(SDWAInst);
-  unsigned SDWAOpcode = SDWAInst->getOpcode();
+  ConvertedInstructions.push_back(MI);
+  unsigned SDWAOpcode = MI->getOpcode();
   ++NumSDWAInstructionsToEliminateFP16Pack;
 
-  MachineOperand *Dst = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst);
+  MachineOperand *Dst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
   assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
 
-  MachineOperand *DstSel =
-      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+  MachineOperand *DstSel = TII->getNamedOperand(*MI, AMDGPU::OpName::dst_sel);
   assert(DstSel &&
          AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
   DstSel->setImm(OpSel);
 
   MachineOperand *DstUnused =
-      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+      TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
   assert(DstUnused &&
          AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
   assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
          "Dst_unused should not be UNUSED_PRESERVE already");
   DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
 
-  auto PreserveDstIdx =
+  int PreserveDstIdx =
       AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
   assert(PreserveDstIdx != -1);
-  auto NewSrcImplitMO = MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+  MachineOperand NewSrcImplitMO =
+      MachineOperand::CreateReg(SrcMO.getReg(), false, true);
   copyRegOperand(NewSrcImplitMO, SrcMO);
-  SDWAInst->addOperand(NewSrcImplitMO);
-  SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+  MI->addOperand(NewSrcImplitMO);
+  MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
 
-  MachineOperand *Src0 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0);
+  MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
   assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
   if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
     MachineOperand *Src0Sel =
-        TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0_sel);
+        TII->getNamedOperand(*MI, AMDGPU::OpName::src0_sel);
     assert(Src0Sel &&
            AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
     Src0Sel->setImm(OpSel);
 
-    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
     return;
   }
 
-  MachineOperand *Src1 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1);
+  MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
   assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
   if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
     MachineOperand *Src1Sel =
-        TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1_sel);
+        TII->getNamedOperand(*MI, AMDGPU::OpName::src1_sel);
     assert(Src1Sel &&
            AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
     Src1Sel->setImm(OpSel);
 
-    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
     return;
   }
 }
@@ -1646,15 +1636,20 @@ unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
     MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
     const GCNSubtarget &ST) {
   unsigned NumOfFP16Def;
+
+  // Walk up the use-def chain from ParentMI until an exit condition is hit:
+  // either none of its use operands is defined by an instruction that is
+  // convertible to SDWA, or more than one is, in which case we cannot tell
+  // which path to follow.
   do {
-    MachineInstr *NextMIInChain = nullptr;
     NumOfFP16Def = 0;
-    for (MachineOperand &currentMO : ParentMI->uses()) {
-      if (!currentMO.isReg() || currentMO.getReg().isPhysical() ||
-          !MRI->hasOneUse(currentMO.getReg()))
+    MachineInstr *NextMIInChain = nullptr;
+    for (MachineOperand &CurrentMO : ParentMI->uses()) {
+      if (!CurrentMO.isReg() || CurrentMO.getReg().isPhysical() ||
+          !MRI->hasOneUse(CurrentMO.getReg()))
         continue;
 
-      MachineOperand *DefCurrMO = findSingleRegDef(&currentMO, MRI);
+      MachineOperand *DefCurrMO = findSingleRegDef(&CurrentMO, MRI);
       if (!DefCurrMO)
         continue;
 
@@ -1668,11 +1663,8 @@ unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
       NumOfFP16Def++;
     }
 
-    if (NumOfFP16Def > 1)
-      break;
-
     ParentMI = NextMIInChain;
-  } while (ParentMI);
+  } while (NumOfFP16Def == 1);
 
   return NumOfFP16Def;
 }
@@ -1683,216 +1675,202 @@ void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
     return;
 
   for (MachineInstr &MI : make_early_inc_range(MBB)) {
-    if (MI.getOpcode() == AMDGPU::V_PACK_B32_F16_e64) {
-      LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
-      std::queue<MachineOperand *> DefSrc0Queue;
-      std::queue<MachineOperand *> DefSrc1Queue;
-      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-
-      if (!Src0->isReg() || Src0->getReg().isPhysical() ||
-          !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
-          Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
-        continue;
+    if (MI.getOpcode() != AMDGPU::V_PACK_B32_F16_e64)
+      continue;
+    LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
 
-      MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
-      MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+    std::queue<MachineOperand *> DefSrc0Queue;
+    std::queue<MachineOperand *> DefSrc1Queue;
+    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
 
-      if (!Op0 || !Op1)
-        continue;
+    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+        !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+        Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+      continue;
 
-      MachineInstr *ParentMIOp0 = Op0->getParent();
-      MachineInstr *ParentMIOp1 = Op1->getParent();
+    MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+    MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
 
-      if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
-          !isSrcDestFP16Bits(ParentMIOp1, TII) ||
-          !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
-          !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
-        continue;
+    if (!Op0 || !Op1)
+      continue;
 
-      DefSrc0Queue.push(Op0);
-      DefSrc1Queue.push(Op1);
+    MachineInstr *ParentMIOp0 = Op0->getParent();
+    MachineInstr *ParentMIOp1 = Op1->getParent();
 
-      // This checks for the given MI, that it only has exact one register MO
-      // use , that is defined by pure FP16 instruction (that is SDWA-able too)
-      unsigned NumOfFP16Def;
+    if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+        !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+        !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+        !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+      continue;
 
-      NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
-      if (NumOfFP16Def > 1)
-        continue;
+    DefSrc0Queue.push(Op0);
+    DefSrc1Queue.push(Op1);
 
-      NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
-      if (NumOfFP16Def > 1)
-        continue;
+    // Check that the given MI has exactly one register use operand that is
+    // defined by a pure FP16 instruction (which must also be SDWA-able).
+    unsigned NumOfFP16Def;
 
-      MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
-      MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
-      Register SrcRootMOReg = AMDGPU::NoRegister;
+    NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
+    if (NumOfFP16Def > 1)
+      continue;
 
-      // Now, check if the last operation for each in of the DefSrcQueue
-      // has the common MO, that would be the source root MO for element-wise
-      // fp16 chain operations
-      for (MachineOperand &Current0MO : Def0RootMI->uses()) {
-        if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
-          continue;
+    NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
+    if (NumOfFP16Def > 1)
+      continue;
 
-        for (MachineOperand &Current1MO : Def1RootMI->uses()) {
-          if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
-            continue;
+    MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+    MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+    Register SrcRootMOReg = AMDGPU::NoRegister;
 
-          if (Current0MO.getReg() == Current1MO.getReg() &&
-              Current0MO.getSubReg() == Current1MO.getSubReg()) {
-            SrcRootMOReg = Current0MO.getReg();
-            break;
-          }
-        }
-        // Found it, no more check needed, so break;
-        if (SrcRootMOReg != AMDGPU::NoRegister)
+    // Now, check whether the last operations in the two DefSrcQueues share
+    // a common MO; that would be the source root MO for the element-wise
+    // fp16 chain operations.
+    for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+      if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+        continue;
+
+      for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+        if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+          continue;
+
+        if (Current0MO.getReg() == Current1MO.getReg() &&
+            Current0MO.getSubReg() == Current1MO.getSubReg()) {
+          SrcRootMOReg = Current0MO.getReg();
           break;
+        }
       }
+      // Found it, no more check needed, so break;
+      if (SrcRootMOReg != AMDGPU::NoRegister)
+        break;
+    }
 
-      if (SrcRootMOReg == AMDGPU::NoRegister)
-        continue;
+    if (SrcRootMOReg == AMDGPU::NoRegister)
+      continue;
 
-      // Also we need to ensure that each of the DefXRootMI should access the
-      // lower and upper half word of SrcRootMOReg respectively.
-      if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg,
-                                      TII))
-        continue;
+    // Also ensure that Def0RootMI and Def1RootMI access the lower and the
+    // upper half-word of SrcRootMOReg, respectively.
+    if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg, TII))
+      continue;
 
-      // The graph below represents the connection :
-      //       Op0Intial  -->  Op0x  --> ...  -->  Op0Final
-      //      /                                       \'
-      // SrcRootMO                                    v_Pack_b32_f16
-      //      \                                       /
-      //       Op1Intial  -->  Op1x  --> ...  -->  Op1Final
-      // The nomenclature is based upon above flow-graph
-      //
-      // Also for each of DefSrcXQueue :
-      // OpXIntial is at back & OpXFinal is at front
-      auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
-      auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
-      auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
-      auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
-
-      MachineOperand *FinalOutMO = nullptr;
-      std::queue<MachineOperand *> ChainedDefOps;
-      AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
-      int NumOfElemInSecondOpChain = 0;
-
-      // Now, we will change the flow as per the dominace of MI as follows, if
-      // possible and store it in ChainedDefOps, so later can be used to convert
-      // into its SDWA version:
-      //
-      // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
-      // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
-      // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
-      //
-      // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
-      // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
-      // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
-      //
-      // TODO : Else, not handled!
-      // One such case is observed when multiple fp16 instruction are chained
-      // on a fp16 vector input. For Example :
-      //
-      // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
-      // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
-      // return <2 x half> %res
-      if (dominates(Op0FinalMI, Op1IntialMI)) {
-        int OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
-        auto &MOTo = Op1IntialMI->getOperand(OpIdx);
-        auto MOFrom = DefSrc0Queue.front();
-        copyRegOperand(MOTo, *MOFrom, true);
-        FinalOutMO = DefSrc1Queue.front();
-
-        LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op1IntialMI
-                          << '\n');
-        OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
-        auto &IntialInMO = Op0IntialMI->getOperand(OpIdx);
-
-        while (!DefSrc1Queue.empty()) {
-          ChainedDefOps.push(DefSrc1Queue.front());
-          DefSrc1Queue.pop();
-          NumOfElemInSecondOpChain++;
-        }
-        while (!DefSrc0Queue.empty()) {
-          ChainedDefOps.push(DefSrc0Queue.front());
-          DefSrc0Queue.pop();
-        }
+    // The graph below represents the connection :
+    //       Op0Intial  -->  Op0x  --> ...  -->  Op0Final
+    //      /                                       \'
+    // SrcRootMO                                    v_Pack_b32_f16
+    //      \                                       /
+    //       Op1Intial  -->  Op1x  --> ...  -->  Op1Final
+    // The nomenclature is based upon above flow-graph
+    //
+    // Also for each of DefSrcXQueue :
+    // OpXIntial is at back & OpXFinal is at front
+    auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
+    auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
+    auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
+    auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
+
+    MachineOperand *FinalOutMO = nullptr;
+    std::queue<MachineOperand *> ChainedDefOps;
+    AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+    int NumOfElemInSecondOpChain = 0;
+
+    auto canonicalizedMIFlow =
+        [&](std::queue<MachineOperand *> DefFromQueue,
+            std::queue<MachineOperand *> DefToQueue) -> void {
+      MachineInstr *OpToIntialMI = (DefToQueue.back())->getParent();
+      int OpIdx = OpToIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+      auto &MOTo = OpToIntialMI->getOperand(OpIdx);
+      auto MOFrom = DefFromQueue.front();
+      copyRegOperand(MOTo, *MOFrom);
+
+      LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *OpToIntialMI << '\n');
+
+      FinalOutMO = DefToQueue.front();
+      MachineInstr *OpFromIntialMI = (DefFromQueue.back())->getParent();
+      OpIdx = OpFromIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+      auto &IntialInMO = OpFromIntialMI->getOperand(OpIdx);
+
+      while (!DefToQueue.empty()) {
+        ChainedDefOps.push(DefToQueue.front());
+        DefToQueue.pop();
+        NumOfElemInSecondOpChain++;
+      }
+      while (!DefFromQueue.empty()) {
+        ChainedDefOps.push(DefFromQueue.front());
+        DefFromQueue.pop();
+      }
+      ChainedDefOps.push(&IntialInMO);
+    };
 
-        ChainedDefOps.push(&IntialInMO);
-        OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
-      } else if (dominates(Op1FinalMI, Op0IntialMI)) {
-        int OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
-        auto &MOTo = Op0IntialMI->getOperand(OpIdx);
-        auto MOFrom = DefSrc1Queue.front();
-        copyRegOperand(MOTo, *MOFrom, true);
-        FinalOutMO = DefSrc0Queue.front();
-
-        LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op0IntialMI
-                          << '\n');
-        OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
-        auto &IntialInMO = Op1IntialMI->getOperand(OpIdx);
-
-        while (!DefSrc0Queue.empty()) {
-          ChainedDefOps.push(DefSrc0Queue.front());
-          DefSrc0Queue.pop();
-          NumOfElemInSecondOpChain++;
-        }
-        while (!DefSrc1Queue.empty()) {
-          ChainedDefOps.push(DefSrc1Queue.front());
-          DefSrc1Queue.pop();
-        }
+    // Now, we will change the flow as per the dominance of MI as follows, if
+    // possible, and store it in ChainedDefOps so that it can later be used to
+    // convert the instructions into their SDWA versions:
+    //
+    // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
+    // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
+    // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
+    //
+    // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
+    // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
+    // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
+    //
+    // TODO : Else, not handled!
+    // One such case is observed when multiple fp16 instruction are chained
+    // on a fp16 vector input. For Example :
+    //
+    // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+    // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+    // return <2 x half> %res
+    if (dominates(Op0FinalMI, Op1IntialMI)) {
+      canonicalizedMIFlow(DefSrc0Queue, DefSrc1Queue);
+      OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+    } else if (dominates(Op1FinalMI, Op0IntialMI)) {
+      canonicalizedMIFlow(DefSrc1Queue, DefSrc0Queue);
+      OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+    } else {
+      LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+      continue;
+    }
 
-        ChainedDefOps.push(&IntialInMO);
-        OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
-      } else {
-        LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+    // Replace all uses of the def operand of MI (v_pack) with FinalOutMO.
+    MachineOperand &DefMO = MI.getOperand(0);
+    for (MachineOperand &MO :
+         make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+      if (!MO.isReg())
         continue;
-      }
 
-      // Replace all use places of MI(v_pack) defMO with FinalOutMO.
-      MachineOperand &DefMO = MI.getOperand(0);
-      for (MachineOperand &MO :
-           make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
-        if (!MO.isReg())
-          continue;
+      MO.setReg(FinalOutMO->getReg());
+      MO.setSubReg(FinalOutMO->getSubReg());
+    }
+    LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI << "With "
+                      << *FinalOutMO << '\n');
 
-        MO.setReg(FinalOutMO->getReg());
-        MO.setSubReg(FinalOutMO->getSubReg());
+    // Delete v_pack machine instruction
+    LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+    MI.eraseFromParent();
+    ++Num16BitPackedInstructionsEliminated;
+
+    // Convert machine instruction into SDWA-version
+    while (ChainedDefOps.size() != 1) {
+      if (NumOfElemInSecondOpChain == 0) {
+        if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+          OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+        else
+          OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
       }
-      LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI
-                        << "With " << *FinalOutMO << '\n');
-
-      // Delete v_pack machine instruction
-      LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
-      MI.eraseFromParent();
-      ++Num16BitPackedInstructionsEliminated;
-
-      // Convert machine instruction into SDWA-version
-      while (ChainedDefOps.size() != 1) {
-        if (NumOfElemInSecondOpChain == 0) {
-          if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
-            OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
-          else
-            OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
-        }
-
-        MachineInstr *DefMI = ChainedDefOps.front()->getParent();
-        ChainedDefOps.pop();
-        MachineOperand *SrcMO = ChainedDefOps.front();
 
-        // Take SrcMO (which are def) as its usage in DefMI
-        if (SrcMO->isDef()) {
-          assert(MRI->hasOneUse(SrcMO->getReg()));
-          SrcMO = findSingleRegUse(SrcMO, MRI);
-          assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
-        }
+      MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+      ChainedDefOps.pop();
+      MachineOperand *SrcMO = ChainedDefOps.front();
 
-        convertMIToSDWAWithOpsel(*DefMI, *SrcMO, OpSel);
-        NumOfElemInSecondOpChain--;
+      // If SrcMO is a def, switch to its single use, which must be in DefMI.
+      if (SrcMO->isDef()) {
+        assert(MRI->hasOneUse(SrcMO->getReg()));
+        SrcMO = findSingleRegUse(SrcMO, MRI);
+        assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
       }
+
+      convertMIToSDWAWithOpsel(DefMI, *SrcMO, OpSel);
+      NumOfElemInSecondOpChain--;
     }
   }
 }

>From 6867911108538bd0bb3c0303be1b9929de76dc34 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 28 Apr 2025 10:41:00 +0000
Subject: [PATCH 6/8] Added MIR test to demonstrate the specific MIR pattern
 handling to eliminate packing for fp16.

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp    |  11 +-
 llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir | 337 +++++++++++++++++++
 2 files changed, 344 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 4ab731f149c93..1aa2bdc40c4d6 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1554,11 +1554,14 @@ static bool dominates(MachineBasicBlock::const_iterator A,
   if (B == MBBEnd)
     return true;
 
-  MachineBasicBlock::const_iterator I = MBB->begin();
-  for (; &*I != A && &*I != B; ++I)
-    ;
+  if (A == MBBEnd)
+    return false;
+
+  MachineBasicBlock::const_iterator I = A;
+  while (I != B && I != MBBEnd)
+    I++;
 
-  return &*I == A;
+  return (I == B);
 }
 
 // Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
new file mode 100644
index 0000000000000..2b2dce0d26a09
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
@@ -0,0 +1,337 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass si-peephole-sdwa,dead-mi-elimination -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+--- |
+  source_filename = "/home/vikashgu/work/upstream/llvm-project/llvm/test/CodeGen/AMDGPU/vector-fp16.ll"
+  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+  target triple = "amdgcn-amd-amdhsa"
+
+  define <4 x half> @sin_v4f16(<4 x half> %a) #0 {
+    %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
+    ret <4 x half> %res
+  }
+
+  define <4 x half> @cos_v4f16(<4 x half> %a) #0 {
+    %res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
+    ret <4 x half> %res
+  }
+
+  define <4 x half> @log_v4f16(<4 x half> %a) #0 {
+    %res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
+    ret <4 x half> %res
+  }
+
+  define <4 x half> @log2_v4f16(<4 x half> %a) #0 {
+    %res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+    ret <4 x half> %res
+  }
+
+  define <4 x half> @exp_v4f16(<4 x half> %a) #0 {
+    %res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
+    ret <4 x half> %res
+  }
+
+  define <4 x half> @cascaded_v4f16(<4 x half> %a) #0 {
+    %b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+    %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
+    ret <4 x half> %res
+  }
+
+  declare <4 x half> @llvm.cos.v4f16(<4 x half>) #1
+
+  declare <4 x half> @llvm.exp.v4f16(<4 x half>) #1
+
+  declare <4 x half> @llvm.log.v4f16(<4 x half>) #1
+
+  declare <4 x half> @llvm.log2.v4f16(<4 x half>) #1
+
+  declare <4 x half> @llvm.sin.v4f16(<4 x half>) #1
+
+  attributes #0 = { "target-cpu"="gfx942" }
+  attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx942" }
+...
+
+
+---
+name:            sin_v4f16
+tracksRegLiveness: true
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: sin_v4f16
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+    ; GFX9-NEXT: [[V_SIN_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_SIN_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
+    ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+    ; GFX9-NEXT: [[V_SIN_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
+    ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa2]](tied-def 0)
+    ; GFX9-NEXT: [[V_SIN_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_SIN_F16_sdwa3]]
+    ; GFX9-NEXT: $vgpr1 = COPY [[V_SIN_F16_sdwa1]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %13:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+    %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
+    %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %21:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+    %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+    %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %26
+    $vgpr1 = COPY %27
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name:            cos_v4f16
+tracksRegLiveness: true
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: cos_v4f16
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+    ; GFX9-NEXT: [[V_COS_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_COS_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
+    ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+    ; GFX9-NEXT: [[V_COS_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
+    ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa2]](tied-def 0)
+    ; GFX9-NEXT: [[V_COS_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_COS_F16_sdwa3]]
+    ; GFX9-NEXT: $vgpr1 = COPY [[V_COS_F16_sdwa1]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %13:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+    %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
+    %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %21:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+    %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+    %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %26
+    $vgpr1 = COPY %27
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name:            log_v4f16
+tracksRegLiveness: true
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: log_v4f16
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa1]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa2]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa2]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa2]](tied-def 0)
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa3]]
+    ; GFX9-NEXT: $vgpr1 = COPY [[V_MUL_F16_sdwa1]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+    %12:sreg_32 = S_MOV_B32 14732
+    %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+    %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %17, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+    %21:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+    %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+    %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %26
+    $vgpr1 = COPY %27
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name:            log2_v4f16
+tracksRegLiveness: true
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: log2_v4f16
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa1]](tied-def 0)
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_LOG_F16_sdwa3]]
+    ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa2]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %11:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+    %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
+    %14:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+    %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+    %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+    %21:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %16, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %18, 0, killed %13, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %21
+    $vgpr1 = COPY %22
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name:            exp_v4f16
+tracksRegLiveness: true
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: exp_v4f16
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811
+    ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_EXP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_EXP_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_EXP_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_2]], 0, killed [[V_CVT_F16_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_CVT_F16_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
+    ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %11:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+    %12:sgpr_32 = S_MOV_B32 1069066811
+    %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %14:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %13, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %14, 0, 0, implicit $mode, implicit $exec
+    %17:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+    %19:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %17, 0, 0, implicit $mode, implicit $exec
+    %20:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %19, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %21:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %21, 0, 0, implicit $mode, implicit $exec
+    %24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec
+    %27:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
+    %28:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+    %30:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %28, 0, 0, implicit $mode, implicit $exec
+    %31:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %30, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %32:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec
+    %33:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %32, 0, 0, implicit $mode, implicit $exec
+    %34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %33, 0, 0, implicit $mode, implicit $exec
+    %35:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %15, 0, killed %22, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %34
+    $vgpr1 = COPY %35
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name:            cascaded_v4f16
+tracksRegLiveness: true
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: cascaded_v4f16
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_LOG_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_LOG_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_SIN_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_SIN_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_SIN_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_2]], 0, killed [[V_SIN_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_]], 0, killed [[V_SIN_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
+    ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %11:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+    %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
+    %14:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+    %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+    %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %23:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %22, 0, 0, implicit $mode, implicit $exec
+    %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %16, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %18, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %27:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
+    %28:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %13, 0, 12568, 0, 0, implicit $mode, implicit $exec
+    %29:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %28, 0, 0, implicit $mode, implicit $exec
+    %30:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %29, 0, 0, implicit $mode, implicit $exec
+    %31:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %23, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %30
+    $vgpr1 = COPY %31
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...

>From ce99d8c107a9d5f3eadd367bbc1b6799e730fbf3 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 6 May 2025 07:49:38 +0000
Subject: [PATCH 7/8] Reduced duplicate code length & added a new MIR test in
 existing testFile.

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp    | 106 ++++++++-----------
 llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir |  43 ++++++++
 2 files changed, 89 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1aa2bdc40c4d6..aa49f80883bf0 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1493,54 +1493,44 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
       Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
     return false;
 
-  MachineOperand *Def1Src0 =
-      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
-  MachineOperand *Def1Src1 =
-      TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
-  MachineOperand *Def0Src0 =
-      TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
-  MachineOperand *Def0Src1 =
-      TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
-
-  auto checkForDef0MIAccess = [&]() -> bool {
-    if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
-      MachineOperand *Def0Src0Sel =
-          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
-      if (!Def0Src0Sel ||
-          Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
-        return true;
-    }
-
-    if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
-      MachineOperand *Def0Src1Sel =
-          TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
-      if (!Def0Src1Sel ||
-          Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
-        return true;
+  const auto checkSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+                               AMDGPU::OpName SrcSelName,
+                               AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
+    MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
+    if (DefSrc && DefSrc->isReg() && (DefSrc->getReg() == SrcRootReg)) {
+      MachineOperand *DefSrcSel = TII->getNamedOperand(*DefMI, SrcSelName);
+      if (SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_0) {
+        if (!DefSrcSel || DefSrcSel->getImm() == SdwaSel)
+          return true;
+      } else {
+        assert(SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+               "Not valid SDWA SrcSel operand");
+        if (DefSrcSel && DefSrcSel->getImm() == SdwaSel)
+          return true;
+      }
     }
-
     return false;
   };
 
-  if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
-    MachineOperand *Def1Src0Sel =
-        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
-    if (!Def1Src0Sel || Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
-      return false;
+  const auto checkForDef0MIAccess = [&]() -> bool {
+    if (checkSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                    AMDGPU::SDWA::SdwaSel::WORD_0))
+      return true;
+    if (checkSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                    AMDGPU::SDWA::SdwaSel::WORD_0))
+      return true;
+    return false;
+  };
 
+  if (checkSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                  AMDGPU::SDWA::SdwaSel::WORD_1))
     if (checkForDef0MIAccess())
       return true;
-  }
-
-  if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
-    MachineOperand *Def1Src1Sel =
-        TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
-    if (!Def1Src1Sel || Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
-      return false;
 
+  if (checkSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                  AMDGPU::SDWA::SdwaSel::WORD_1))
     if (checkForDef0MIAccess())
       return true;
-  }
 
   return false;
 }
@@ -1593,7 +1583,7 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
       TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
   assert(DstUnused &&
          AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
-  assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+  assert(DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE &&
          "Dst_unused should not be UNUSED_PRESERVE already");
   DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
 
@@ -1606,31 +1596,27 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
   MI->addOperand(NewSrcImplitMO);
   MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
 
-  MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
-  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
-  if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
-    MachineOperand *Src0Sel =
-        TII->getNamedOperand(*MI, AMDGPU::OpName::src0_sel);
-    assert(Src0Sel &&
-           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
-    Src0Sel->setImm(OpSel);
+  auto modifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
+                                   AMDGPU::OpName SrcSelName) -> bool {
+    MachineOperand *Src = TII->getNamedOperand(*MI, SrcName);
+    assert(Src && AMDGPU::hasNamedOperand(SDWAOpcode, SrcName));
+    if (Src->isReg() && (Src->getReg() == SrcMO.getReg())) {
+      MachineOperand *SrcSel = TII->getNamedOperand(*MI, SrcSelName);
+      assert(SrcSel && AMDGPU::hasNamedOperand(SDWAOpcode, SrcSelName));
+      SrcSel->setImm(OpSel);
 
-    LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
-    return;
-  }
+      LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+      return true;
+    }
 
-  MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
-  assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
-  if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
-    MachineOperand *Src1Sel =
-        TII->getNamedOperand(*MI, AMDGPU::OpName::src1_sel);
-    assert(Src1Sel &&
-           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
-    Src1Sel->setImm(OpSel);
+    return false;
+  };
 
-    LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+  if (modifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
+    return;
+
+  if (modifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
     return;
-  }
 }
 
 // BackTracks the given Parent MI to look for any of its use operand that has
diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
index 2b2dce0d26a09..9318f8dd2bbea 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
@@ -37,6 +37,10 @@
     ret <4 x half> %res
   }
 
+  define void @unbalanced_operations_packed(<4 x half> %a) #0 {
+    ret void
+  }
+
   declare <4 x half> @llvm.cos.v4f16(<4 x half>) #1
 
   declare <4 x half> @llvm.exp.v4f16(<4 x half>) #1
@@ -335,3 +339,42 @@ body:             |
     $vgpr1 = COPY %31
     SI_RETURN implicit $vgpr0, implicit $vgpr1
 ...
+
+---
+name:            unbalanced_operations_packed
+tracksRegLiveness: true
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1
+    ; GFX9-LABEL: name: unbalanced_operations_packed
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa2]](tied-def 0)
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa1]]
+    ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa1]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+    %12:sreg_32 = S_MOV_B32 14732
+    %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+    %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
+    %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+    %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+    %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %17, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %26
+    $vgpr1 = COPY %27
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...

>From d0d7bcec878fc3cd1d1691c7c7c1b7b8a7b176d2 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 13 May 2025 10:44:30 +0000
Subject: [PATCH 8/8] Updated MIR test case, and addressed code reviews.

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp    |   35 +-
 llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir |  360 +--
 llvm/test/CodeGen/AMDGPU/vector-fp16.ll      | 2758 ------------------
 3 files changed, 109 insertions(+), 3044 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/vector-fp16.ll

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index aa49f80883bf0..2cb5a31870f5b 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1493,7 +1493,7 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
       Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
     return false;
 
-  const auto checkSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+  const auto CheckSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
                                AMDGPU::OpName SrcSelName,
                                AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
     MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
@@ -1512,27 +1512,16 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
     return false;
   };
 
-  const auto checkForDef0MIAccess = [&]() -> bool {
-    if (checkSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
-                    AMDGPU::SDWA::SdwaSel::WORD_0))
-      return true;
-    if (checkSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
-                    AMDGPU::SDWA::SdwaSel::WORD_0))
-      return true;
+  if (!CheckSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                   AMDGPU::SDWA::SdwaSel::WORD_1) &&
+      !CheckSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                   AMDGPU::SDWA::SdwaSel::WORD_1))
     return false;
-  };
-
-  if (checkSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
-                  AMDGPU::SDWA::SdwaSel::WORD_1))
-    if (checkForDef0MIAccess())
-      return true;
-
-  if (checkSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
-                  AMDGPU::SDWA::SdwaSel::WORD_1))
-    if (checkForDef0MIAccess())
-      return true;
 
-  return false;
+  return CheckSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+                     AMDGPU::SDWA::SdwaSel::WORD_0) ||
+         CheckSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+                     AMDGPU::SDWA::SdwaSel::WORD_0);
 }
 
 /// Given A and B are in the same MBB, returns true if A comes before B.
@@ -1596,7 +1585,7 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
   MI->addOperand(NewSrcImplitMO);
   MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
 
-  auto modifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
+  auto ModifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
                                    AMDGPU::OpName SrcSelName) -> bool {
     MachineOperand *Src = TII->getNamedOperand(*MI, SrcName);
     assert(Src && AMDGPU::hasNamedOperand(SDWAOpcode, SrcName));
@@ -1612,10 +1601,10 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
     return false;
   };
 
-  if (modifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
+  if (ModifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
     return;
 
-  if (modifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
+  if (ModifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
     return;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
index 9318f8dd2bbea..e45d7dd8f2029 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
@@ -1,245 +1,126 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass si-peephole-sdwa,dead-mi-elimination -o - %s | FileCheck -check-prefixes=GFX9 %s
 
---- |
-  source_filename = "/home/vikashgu/work/upstream/llvm-project/llvm/test/CodeGen/AMDGPU/vector-fp16.ll"
-  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-  target triple = "amdgcn-amd-amdhsa"
-
-  define <4 x half> @sin_v4f16(<4 x half> %a) #0 {
-    %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
-    ret <4 x half> %res
-  }
-
-  define <4 x half> @cos_v4f16(<4 x half> %a) #0 {
-    %res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
-    ret <4 x half> %res
-  }
-
-  define <4 x half> @log_v4f16(<4 x half> %a) #0 {
-    %res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
-    ret <4 x half> %res
-  }
-
-  define <4 x half> @log2_v4f16(<4 x half> %a) #0 {
-    %res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
-    ret <4 x half> %res
-  }
-
-  define <4 x half> @exp_v4f16(<4 x half> %a) #0 {
-    %res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
-    ret <4 x half> %res
-  }
-
-  define <4 x half> @cascaded_v4f16(<4 x half> %a) #0 {
-    %b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
-    %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
-    ret <4 x half> %res
-  }
-
-  define void @unbalanced_operations_packed(<4 x half> %a) #0 {
-    ret void
-  }
-
-  declare <4 x half> @llvm.cos.v4f16(<4 x half>) #1
-
-  declare <4 x half> @llvm.exp.v4f16(<4 x half>) #1
-
-  declare <4 x half> @llvm.log.v4f16(<4 x half>) #1
-
-  declare <4 x half> @llvm.log2.v4f16(<4 x half>) #1
-
-  declare <4 x half> @llvm.sin.v4f16(<4 x half>) #1
-
-  attributes #0 = { "target-cpu"="gfx942" }
-  attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx942" }
-...
-
-
 ---
-name:            sin_v4f16
+name:            symmetric_equal_edges_fp16
 tracksRegLiveness: true
 body:             |
-  bb.0 (%ir-block.0):
-    liveins: $vgpr0, $vgpr1
-
-    ; GFX9-LABEL: name: sin_v4f16
-    ; GFX9: liveins: $vgpr0, $vgpr1
+  bb.0:
+    liveins: $vgpr0
+    ; GFX9-LABEL: name: symmetric_equal_edges_fp16
+    ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
     ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
     ; GFX9-NEXT: [[V_SIN_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
     ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
     ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa]](tied-def 0)
     ; GFX9-NEXT: [[V_SIN_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
-    ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
-    ; GFX9-NEXT: [[V_SIN_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
-    ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa2]](tied-def 0)
-    ; GFX9-NEXT: [[V_SIN_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
-    ; GFX9-NEXT: $vgpr0 = COPY [[V_SIN_F16_sdwa3]]
-    ; GFX9-NEXT: $vgpr1 = COPY [[V_SIN_F16_sdwa1]]
-    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
-    %9:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_SIN_F16_sdwa1]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
     %8:vgpr_32 = COPY $vgpr0
-    %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
-    %13:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
-    %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
-    %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
-    %18:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
     %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
     %21:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
     %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
     %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
     %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
     %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
-    %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = COPY %26
-    $vgpr1 = COPY %27
-    SI_RETURN implicit $vgpr0, implicit $vgpr1
+    SI_RETURN implicit $vgpr0
 ...
 
 ---
-name:            cos_v4f16
+name:            asymmetric_equal_edges_fp16
 tracksRegLiveness: true
 body:             |
-  bb.0 (%ir-block.0):
-    liveins: $vgpr0, $vgpr1
-
-    ; GFX9-LABEL: name: cos_v4f16
-    ; GFX9: liveins: $vgpr0, $vgpr1
+  bb.0:
+    liveins: $vgpr0
+    ; GFX9-LABEL: name: asymmetric_equal_edges_fp16
+    ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
     ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
     ; GFX9-NEXT: [[V_COS_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
     ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
     ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa]](tied-def 0)
-    ; GFX9-NEXT: [[V_COS_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
-    ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
-    ; GFX9-NEXT: [[V_COS_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
-    ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa2]](tied-def 0)
-    ; GFX9-NEXT: [[V_COS_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
-    ; GFX9-NEXT: $vgpr0 = COPY [[V_COS_F16_sdwa3]]
-    ; GFX9-NEXT: $vgpr1 = COPY [[V_COS_F16_sdwa1]]
-    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
-    %9:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[V_EXP_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_EXP_F16_sdwa]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
     %8:vgpr_32 = COPY $vgpr0
-    %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
-    %13:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
-    %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
-    %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
-    %18:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
     %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
     %21:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
     %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
     %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
-    %25:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
     %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
-    %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = COPY %26
-    $vgpr1 = COPY %27
-    SI_RETURN implicit $vgpr0, implicit $vgpr1
+    SI_RETURN implicit $vgpr0
 ...
 
 ---
-name:            log_v4f16
+name:            asymmetric_unequal_edges_fp16
 tracksRegLiveness: true
 body:             |
-  bb.0 (%ir-block.0):
-    liveins: $vgpr0, $vgpr1
-
-    ; GFX9-LABEL: name: log_v4f16
-    ; GFX9: liveins: $vgpr0, $vgpr1
+  bb.0:
+    liveins: $vgpr0
+    ; GFX9-LABEL: name: asymmetric_unequal_edges_fp16
+    ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa1]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa2]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa2]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa2]](tied-def 0)
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
-    ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa3]]
-    ; GFX9-NEXT: $vgpr1 = COPY [[V_MUL_F16_sdwa1]]
-    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
-    %9:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+    ; GFX9-NEXT: [[V_EXP_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_EXP_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_EXP_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
     %8:vgpr_32 = COPY $vgpr0
-    %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
     %12:sreg_32 = S_MOV_B32 14732
-    %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
-    %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
-    %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
-    %18:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %17, 0, %12, 0, 0, implicit $mode, implicit $exec
     %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
-    %21:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, %12, 0, 0, implicit $mode, implicit $exec
     %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
-    %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
+    %24:vgpr_32 = nofpexcept V_EXP_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
     %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
-    %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
-    %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %25, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = COPY %26
-    $vgpr1 = COPY %27
-    SI_RETURN implicit $vgpr0, implicit $vgpr1
+    SI_RETURN implicit $vgpr0
 ...
 
 ---
-name:            log2_v4f16
+name:            symmetric_one_edge_fp16
 tracksRegLiveness: true
 body:             |
-  bb.0 (%ir-block.0):
-    liveins: $vgpr0, $vgpr1
-
-    ; GFX9-LABEL: name: log2_v4f16
-    ; GFX9: liveins: $vgpr0, $vgpr1
+  bb.0:
+    liveins: $vgpr0
+    ; GFX9-LABEL: name: symmetric_one_edge_fp16
+    ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa1]](tied-def 0)
-    ; GFX9-NEXT: $vgpr0 = COPY [[V_LOG_F16_sdwa3]]
-    ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa2]]
-    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
-    %9:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_LOG_F16_sdwa1]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
     %8:vgpr_32 = COPY $vgpr0
-    %11:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
-    %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
     %14:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
     %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
-    %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
     %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
     %21:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %16, 0, 0, implicit $mode, implicit $exec
-    %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %18, 0, killed %13, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = COPY %21
-    $vgpr1 = COPY %22
-    SI_RETURN implicit $vgpr0, implicit $vgpr1
+    SI_RETURN implicit $vgpr0
 ...
 
 ---
-name:            exp_v4f16
+name:            symmetric_equal_edges_fp16_fp32
 tracksRegLiveness: true
 body:             |
-  bb.0 (%ir-block.0):
+  bb.0:
     liveins: $vgpr0, $vgpr1
-
-    ; GFX9-LABEL: name: exp_v4f16
+    ; GFX9-LABEL: name: symmetric_equal_edges_fp16_fp32
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811
+    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
@@ -247,31 +128,11 @@ body:             |
     ; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: [[V_EXP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_EXP_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_EXP_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_2]], 0, killed [[V_CVT_F16_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_CVT_F16_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_CVT_F16_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
-    ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
-    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
-    %9:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
     %8:vgpr_32 = COPY $vgpr0
-    %11:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
     %12:sgpr_32 = S_MOV_B32 1069066811
-    %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
-    %14:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %13, 0, 0, implicit $mode, implicit $exec
-    %15:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %14, 0, 0, implicit $mode, implicit $exec
-    %17:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
-    %19:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %17, 0, 0, implicit $mode, implicit $exec
-    %20:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %19, 0, %12, 0, 0, implicit $mode, implicit $exec
-    %21:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
-    %22:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %21, 0, 0, implicit $mode, implicit $exec
     %24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
     %25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
     %26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec
@@ -282,99 +143,72 @@ body:             |
     %32:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec
     %33:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %32, 0, 0, implicit $mode, implicit $exec
     %34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %33, 0, 0, implicit $mode, implicit $exec
-    %35:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %15, 0, killed %22, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = COPY %34
-    $vgpr1 = COPY %35
-    SI_RETURN implicit $vgpr0, implicit $vgpr1
+    SI_RETURN implicit $vgpr0
 ...
 
 ---
-name:            cascaded_v4f16
+name:            asymmetric_unequal_edges_fp16_fp32
 tracksRegLiveness: true
 body:             |
-  bb.0 (%ir-block.0):
+  bb.0:
     liveins: $vgpr0, $vgpr1
-
-    ; GFX9-LABEL: name: cascaded_v4f16
+    ; GFX9-LABEL: name: asymmetric_unequal_edges_fp16_fp32
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_LOG_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_LOG_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811
+    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[S_MOV_B32_]], 0, 0, 6, 0, 5, 6, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_EXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed [[V_MUL_F16_sdwa]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_EXP_F16_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+    %8:vgpr_32 = COPY $vgpr0
+    %12:sgpr_32 = S_MOV_B32 1069066811
+    %24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec
+    %27:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
+    %28:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+    %31:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %28, 0, %12, 0, 0, implicit $mode, implicit $exec
+    %32:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec
+    %34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %32, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %34
+    SI_RETURN implicit $vgpr0
+...
+
+---
+name:            interleaved_symmetric_edges_fp16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; GFX9-LABEL: name: interleaved_symmetric_edges_fp16
+    ; GFX9: liveins: $vgpr0
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_LOG_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_]], 0, 12568, 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa]], 0, 12568, 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: [[V_SIN_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_]], 0, 12568, 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_SIN_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_2]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_MUL_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa]], 0, 12568, 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_SIN_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_2]], 0, killed [[V_SIN_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
-    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_]], 0, killed [[V_SIN_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_]], 0, killed [[V_SIN_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
-    ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
-    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
-    %9:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
     %8:vgpr_32 = COPY $vgpr0
     %11:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
     %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
-    %14:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
-    %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
     %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
-    %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
-    %22:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, 12568, 0, 0, implicit $mode, implicit $exec
-    %23:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %22, 0, 0, implicit $mode, implicit $exec
-    %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %16, 0, 12568, 0, 0, implicit $mode, implicit $exec
-    %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
     %26:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %18, 0, 12568, 0, 0, implicit $mode, implicit $exec
     %27:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
     %28:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %13, 0, 12568, 0, 0, implicit $mode, implicit $exec
     %29:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %28, 0, 0, implicit $mode, implicit $exec
     %30:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %29, 0, 0, implicit $mode, implicit $exec
-    %31:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %23, 0, killed %25, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = COPY %30
-    $vgpr1 = COPY %31
-    SI_RETURN implicit $vgpr0, implicit $vgpr1
-...
-
----
-name:            unbalanced_operations_packed
-tracksRegLiveness: true
-body:             |
-  bb.0 (%ir-block.0):
-    liveins: $vgpr0, $vgpr1
-    ; GFX9-LABEL: name: unbalanced_operations_packed
-    ; GFX9: liveins: $vgpr0, $vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
-    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
-    ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa2]](tied-def 0)
-    ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
-    ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa1]]
-    ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa1]]
-    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
-    %9:vgpr_32 = COPY $vgpr1
-    %8:vgpr_32 = COPY $vgpr0
-    %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
-    %12:sreg_32 = S_MOV_B32 14732
-    %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
-    %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
-    %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
-    %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
-    %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
-    %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
-    %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
-    %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %25, 0, 0, implicit $mode, implicit $exec
-    %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %17, 0, 0, implicit $mode, implicit $exec
-    $vgpr0 = COPY %26
-    $vgpr1 = COPY %27
-    SI_RETURN implicit $vgpr0, implicit $vgpr1
+    SI_RETURN implicit $vgpr0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/vector-fp16.ll b/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
deleted file mode 100644
index 501630e790200..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
+++ /dev/null
@@ -1,2758 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-
-declare <1 x half> @llvm.sin.v1f16(<1 x half>)
-declare <1 x half> @llvm.cos.v1f16(<1 x half>)
-declare <1 x half> @llvm.log.v1f16(<1 x half>)
-declare <1 x half> @llvm.log2.v1f16(<1 x half>)
-declare <1 x half> @llvm.log10.v1f16(<1 x half>)
-declare <1 x half> @llvm.exp.v1f16(<1 x half>)
-declare <1 x half> @llvm.exp2.v1f16(<1 x half>)
-declare <1 x half> @llvm.exp10.v1f16(<1 x half>)
-declare <1 x half> @llvm.sqrt.v1f16(<1 x half>)
-
-declare <2 x half> @llvm.sin.v2f16(<2 x half>)
-declare <2 x half> @llvm.cos.v2f16(<2 x half>)
-declare <2 x half> @llvm.log.v2f16(<2 x half>)
-declare <2 x half> @llvm.log2.v2f16(<2 x half>)
-declare <2 x half> @llvm.log10.v2f16(<2 x half>)
-declare <2 x half> @llvm.exp.v2f16(<2 x half>)
-declare <2 x half> @llvm.exp2.v2f16(<2 x half>)
-declare <2 x half> @llvm.exp10.v2f16(<2 x half>)
-declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
-
-declare <4 x half> @llvm.sin.v4f16(<4 x half>)
-declare <4 x half> @llvm.cos.v4f16(<4 x half>)
-declare <4 x half> @llvm.log.v4f16(<4 x half>)
-declare <4 x half> @llvm.log2.v4f16(<4 x half>)
-declare <4 x half> @llvm.log10.v4f16(<4 x half>)
-declare <4 x half> @llvm.exp.v4f16(<4 x half>)
-declare <4 x half> @llvm.exp2.v4f16(<4 x half>)
-declare <4 x half> @llvm.exp10.v4f16(<4 x half>)
-declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
-
-declare <5 x half> @llvm.sin.v5f16(<5 x half>)
-declare <5 x half> @llvm.cos.v5f16(<5 x half>)
-declare <5 x half> @llvm.log.v5f16(<5 x half>)
-declare <5 x half> @llvm.log2.v5f16(<5 x half>)
-declare <5 x half> @llvm.log10.v5f16(<5 x half>)
-declare <5 x half> @llvm.exp.v5f16(<5 x half>)
-declare <5 x half> @llvm.exp2.v5f16(<5 x half>)
-declare <5 x half> @llvm.exp10.v5f16(<5 x half>)
-declare <5 x half> @llvm.sqrt.v5f16(<5 x half>)
-
-
-define <1 x half> @sin_v1f16(<1 x half> %a) {
-; GFX8-LABEL: sin_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_sin_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: sin_v1f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT:    v_sin_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sin_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX10-NEXT:    v_sin_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sin_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sin_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.sin.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <1 x half> @cos_v1f16(<1 x half> %a) {
-; GFX8-LABEL: cos_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_cos_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: cos_v1f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT:    v_cos_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cos_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX10-NEXT:    v_cos_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cos_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cos_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.cos.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <1 x half> @log_v1f16(<1 x half> %a) {
-; GFX8-LABEL: log_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log_v1f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_e32 v0, v0
-; GFX906-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log_v1f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_e32 v0, v0
-; GFX908-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log_v1f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_e32 v0, v0
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_e32 v0, v0
-; GFX10-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.log.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <1 x half> @log2_v1f16(<1 x half> %a) {
-; GFX8-LABEL: log2_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: log2_v1f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_log_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log2_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log2_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.log2.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <1 x half> @log10_v1f16(<1 x half> %a) {
-; GFX8-LABEL: log10_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log10_v1f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_e32 v0, v0
-; GFX906-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log10_v1f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_e32 v0, v0
-; GFX908-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log10_v1f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_e32 v0, v0
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log10_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_e32 v0, v0
-; GFX10-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log10_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.log10.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <1 x half> @exp_v1f16(<1 x half> %a) {
-; GFX8-LABEL: exp_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: exp_v1f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX906-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX906-NEXT:    v_exp_f32_e32 v0, v0
-; GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: exp_v1f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX908-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX908-NEXT:    v_exp_f32_e32 v0, v0
-; GFX908-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: exp_v1f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX942-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX942-NEXT:    v_exp_f32_e32 v0, v0
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.exp.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <1 x half> @exp2_v1f16(<1 x half> %a) {
-; GFX8-LABEL: exp2_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp2_v1f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_exp_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp2_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_exp_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp2_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_exp_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.exp2.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <1 x half> @exp10_v1f16(<1 x half> %a) {
-; GFX8-LABEL: exp10_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: exp10_v1f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX906-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX906-NEXT:    v_exp_f32_e32 v0, v0
-; GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: exp10_v1f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX908-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX908-NEXT:    v_exp_f32_e32 v0, v0
-; GFX908-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: exp10_v1f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX942-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX942-NEXT:    v_exp_f32_e32 v0, v0
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp10_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp10_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.exp10.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <1 x half> @sqrt_v1f16(<1 x half> %a) {
-; GFX8-LABEL: sqrt_v1f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: sqrt_v1f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sqrt_v1f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sqrt_v1f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <1 x half> @llvm.sqrt.v1f16(<1 x half> %a)
-	ret <1 x half> %res
-}
-
-define <2 x half> @sin_v2f16(<2 x half> %a) {
-; GFX8-LABEL: sin_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT:    v_fract_f16_e32 v1, v1
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_sin_f16_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: sin_v2f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: sin_v2f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: sin_v2f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sin_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sin_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    v_sin_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sin_f16_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.sin.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <2 x half> @cos_v2f16(<2 x half> %a) {
-; GFX8-LABEL: cos_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT:    v_fract_f16_e32 v1, v1
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cos_f16_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: cos_v2f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: cos_v2f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: cos_v2f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cos_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3118
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cos_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    v_cos_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cos_f16_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.cos.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <2 x half> @log_v2f16(<2 x half> %a) {
-; GFX8-LABEL: log_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x398c
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log_v2f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_movk_i32 s4, 0x398c
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log_v2f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_movk_i32 s4, 0x398c
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log_v2f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_movk_i32 s0, 0x398c
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x398c
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.log.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <2 x half> @log2_v2f16(<2 x half> %a) {
-; GFX8-LABEL: log2_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log2_v2f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log2_v2f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log2_v2f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log2_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log2_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.log2.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <2 x half> @log10_v2f16(<2 x half> %a) {
-; GFX8-LABEL: log10_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x34d1
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log10_v2f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_movk_i32 s4, 0x34d1
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log10_v2f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_movk_i32 s4, 0x34d1
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log10_v2f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_movk_i32 s0, 0x34d1
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log10_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x34d1
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log10_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.log10.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <2 x half> @exp_v2f16(<2 x half> %a) {
-; GFX8-LABEL: exp_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v1
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp_v2f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_exp_f32_e32 v1, v1
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    v_exp_f32_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.exp.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <2 x half> @exp2_v2f16(<2 x half> %a) {
-; GFX8-LABEL: exp2_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: exp2_v2f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: exp2_v2f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: exp2_v2f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp2_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp2_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_exp_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f16_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <2 x half> @exp10_v2f16(<2 x half> %a) {
-; GFX8-LABEL: exp10_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v1
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp10_v2f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp10_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_exp_f32_e32 v1, v1
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp10_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    v_exp_f32_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.exp10.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <2 x half> @sqrt_v2f16(<2 x half> %a) {
-; GFX8-LABEL: sqrt_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sqrt_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: sqrt_v2f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: sqrt_v2f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: sqrt_v2f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sqrt_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sqrt_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
-	ret <2 x half> %res
-}
-
-define <4 x half> @sin_v4f16(<4 x half> %a) {
-; GFX8-LABEL: sin_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v1
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v4, 0.15915494, v0
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_fract_f16_e32 v2, v2
-; GFX8-NEXT:    v_fract_f16_e32 v1, v1
-; GFX8-NEXT:    v_fract_f16_e32 v4, v4
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_sin_f16_e32 v2, v2
-; GFX8-NEXT:    v_sin_f16_e32 v4, v4
-; GFX8-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: sin_v4f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v2, 0x3118
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: sin_v4f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v2, 0x3118
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: sin_v4f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0x3118
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sin_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sin_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sin_f16_e32 v1, v1
-; GFX11-NEXT:    v_sin_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sin_f16_e32 v2, v2
-; GFX11-NEXT:    v_sin_f16_e32 v3, v3
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <4 x half> @cos_v4f16(<4 x half> %a) {
-; GFX8-LABEL: cos_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v1
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v4, 0.15915494, v0
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_fract_f16_e32 v2, v2
-; GFX8-NEXT:    v_fract_f16_e32 v1, v1
-; GFX8-NEXT:    v_fract_f16_e32 v4, v4
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_cos_f16_e32 v2, v2
-; GFX8-NEXT:    v_cos_f16_e32 v4, v4
-; GFX8-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: cos_v4f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v2, 0x3118
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: cos_v4f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v2, 0x3118
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: cos_v4f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0x3118
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cos_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cos_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cos_f16_e32 v1, v1
-; GFX11-NEXT:    v_cos_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cos_f16_e32 v2, v2
-; GFX11-NEXT:    v_cos_f16_e32 v3, v3
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <4 x half> @log_v4f16(<4 x half> %a) {
-; GFX8-LABEL: log_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v2, v1
-; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v3, v0
-; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x398c
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log_v4f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_movk_i32 s4, 0x398c
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log_v4f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_movk_i32 s4, 0x398c
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log_v4f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_movk_i32 s0, 0x398c
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x398c
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f16_e32 v3, v3
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <4 x half> @log2_v4f16(<4 x half> %a) {
-; GFX8-LABEL: log2_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_log_f16_e32 v1, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: log2_v4f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log2_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log2_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f16_e32 v3, v3
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <4 x half> @log10_v4f16(<4 x half> %a) {
-; GFX8-LABEL: log10_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v2, v1
-; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v3, v0
-; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x34d1
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log10_v4f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_movk_i32 s4, 0x34d1
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log10_v4f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_movk_i32 s4, 0x34d1
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log10_v4f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_movk_i32 s0, 0x34d1
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log10_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x34d1
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log10_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f16_e32 v3, v3
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.log10.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <4 x half> @exp_v4f16(<4 x half> %a) {
-; GFX8-LABEL: exp_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_exp_f32_e32 v2, v2
-; GFX8-NEXT:    v_exp_f32_e32 v3, v3
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp_v4f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_exp_f32_e32 v2, v2
-; GFX9-NEXT:    v_exp_f32_e32 v3, v3
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT:    v_exp_f32_e32 v2, v2
-; GFX10-NEXT:    v_exp_f32_e32 v3, v3
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_exp_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT:    v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_exp_f32_e32 v1, v1
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f32_e32 v2, v2
-; GFX11-NEXT:    v_exp_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <4 x half> @exp2_v4f16(<4 x half> %a) {
-; GFX8-LABEL: exp2_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_exp_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_exp_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    v_exp_f16_e32 v1, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp2_v4f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp2_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp2_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_exp_f16_e32 v1, v1
-; GFX11-NEXT:    v_exp_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f16_e32 v2, v2
-; GFX11-NEXT:    v_exp_f16_e32 v3, v3
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.exp2.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <4 x half> @exp10_v4f16(<4 x half> %a) {
-; GFX8-LABEL: exp10_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_exp_f32_e32 v2, v2
-; GFX8-NEXT:    v_exp_f32_e32 v3, v3
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp10_v4f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_exp_f32_e32 v2, v2
-; GFX9-NEXT:    v_exp_f32_e32 v3, v3
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp10_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT:    v_exp_f32_e32 v2, v2
-; GFX10-NEXT:    v_exp_f32_e32 v3, v3
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_exp_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp10_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT:    v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_exp_f32_e32 v1, v1
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f32_e32 v2, v2
-; GFX11-NEXT:    v_exp_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.exp10.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <4 x half> @sqrt_v4f16(<4 x half> %a) {
-; GFX8-LABEL: sqrt_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sqrt_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_sqrt_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT:    v_sqrt_f16_e32 v1, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: sqrt_v4f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sqrt_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sqrt_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sqrt_f16_e32 v2, v2
-; GFX11-NEXT:    v_sqrt_f16_e32 v3, v3
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
-	ret <4 x half> %res
-}
-
-define <5 x half> @sin_v5f16(<5 x half> %a) {
-; GFX8-LABEL: sin_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3118
-; GFX8-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v5, 0.15915494, v0
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX8-NEXT:    v_fract_f16_e32 v3, v3
-; GFX8-NEXT:    v_fract_f16_e32 v1, v1
-; GFX8-NEXT:    v_fract_f16_e32 v5, v5
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_sin_f16_e32 v3, v3
-; GFX8-NEXT:    v_sin_f16_e32 v5, v5
-; GFX8-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_fract_f16_e32 v2, v2
-; GFX8-NEXT:    v_sin_f16_e32 v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: sin_v5f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX906-NEXT:    v_sin_f16_e32 v2, v2
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: sin_v5f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX908-NEXT:    v_sin_f16_e32 v2, v2
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: sin_v5f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX942-NEXT:    v_sin_f16_e32 v2, v2
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sin_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX10-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_sin_f16_e32 v2, v2
-; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sin_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
-; GFX11-NEXT:    v_sin_f16_e32 v1, v1
-; GFX11-NEXT:    v_sin_f16_e32 v0, v0
-; GFX11-NEXT:    v_sin_f16_e32 v2, v2
-; GFX11-NEXT:    v_sin_f16_e32 v3, v3
-; GFX11-NEXT:    v_sin_f16_e32 v4, v4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.sin.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <5 x half> @cos_v5f16(<5 x half> %a) {
-; GFX8-LABEL: cos_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3118
-; GFX8-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v5, 0.15915494, v0
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX8-NEXT:    v_fract_f16_e32 v3, v3
-; GFX8-NEXT:    v_fract_f16_e32 v1, v1
-; GFX8-NEXT:    v_fract_f16_e32 v5, v5
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_cos_f16_e32 v3, v3
-; GFX8-NEXT:    v_cos_f16_e32 v5, v5
-; GFX8-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_fract_f16_e32 v2, v2
-; GFX8-NEXT:    v_cos_f16_e32 v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: cos_v5f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX906-NEXT:    v_cos_f16_e32 v2, v2
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: cos_v5f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX908-NEXT:    v_cos_f16_e32 v2, v2
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: cos_v5f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX942-NEXT:    v_cos_f16_e32 v2, v2
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cos_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3118
-; GFX10-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_cos_f16_e32 v2, v2
-; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cos_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
-; GFX11-NEXT:    v_cos_f16_e32 v1, v1
-; GFX11-NEXT:    v_cos_f16_e32 v0, v0
-; GFX11-NEXT:    v_cos_f16_e32 v2, v2
-; GFX11-NEXT:    v_cos_f16_e32 v3, v3
-; GFX11-NEXT:    v_cos_f16_e32 v4, v4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.cos.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <5 x half> @log_v5f16(<5 x half> %a) {
-; GFX8-LABEL: log_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v3, v1
-; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v4, v0
-; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v2, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x398c
-; GFX8-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v4, 0x398c, v4
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log_v5f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_movk_i32 s4, 0x398c
-; GFX906-NEXT:    v_log_f16_e32 v2, v2
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log_v5f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_movk_i32 s4, 0x398c
-; GFX908-NEXT:    v_log_f16_e32 v2, v2
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log_v5f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_movk_i32 s0, 0x398c
-; GFX942-NEXT:    v_log_f16_e32 v2, v2
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x398c
-; GFX10-NEXT:    v_log_f16_e32 v2, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    v_log_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f16_e32 v3, v3
-; GFX11-NEXT:    v_log_f16_e32 v4, v4
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX11-NEXT:    v_mul_f16_e32 v4, 0x398c, v4
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.log.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <5 x half> @log2_v5f16(<5 x half> %a) {
-; GFX8-LABEL: log2_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_log_f16_e32 v1, v1
-; GFX8-NEXT:    v_log_f16_e32 v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: log2_v5f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_log_f16_e32 v2, v2
-; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log2_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_e32 v2, v2
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log2_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    v_log_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f16_e32 v3, v3
-; GFX11-NEXT:    v_log_f16_e32 v4, v4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <5 x half> @log10_v5f16(<5 x half> %a) {
-; GFX8-LABEL: log10_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v3, v1
-; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v4, v0
-; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v2, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x34d1
-; GFX8-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v4, 0x34d1, v4
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log10_v5f16:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT:    s_movk_i32 s4, 0x34d1
-; GFX906-NEXT:    v_log_f16_e32 v2, v2
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX906-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log10_v5f16:
-; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT:    s_movk_i32 s4, 0x34d1
-; GFX908-NEXT:    v_log_f16_e32 v2, v2
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX908-NEXT:    v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log10_v5f16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT:    s_movk_i32 s0, 0x34d1
-; GFX942-NEXT:    v_log_f16_e32 v2, v2
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX942-NEXT:    v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log10_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x34d1
-; GFX10-NEXT:    v_log_f16_e32 v2, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log10_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    v_log_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f16_e32 v3, v3
-; GFX11-NEXT:    v_log_f16_e32 v4, v4
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX11-NEXT:    v_mul_f16_e32 v4, 0x34d1, v4
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.log10.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <5 x half> @exp_v5f16(<5 x half> %a) {
-; GFX8-LABEL: exp_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX8-NEXT:    v_exp_f32_e32 v3, v3
-; GFX8-NEXT:    v_exp_f32_e32 v4, v4
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v1
-; GFX8-NEXT:    v_exp_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp_v5f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX9-NEXT:    v_exp_f32_e32 v3, v3
-; GFX9-NEXT:    v_exp_f32_e32 v4, v4
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_exp_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX10-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX10-NEXT:    v_exp_f32_e32 v3, v3
-; GFX10-NEXT:    v_exp_f32_e32 v4, v4
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_exp_f32_e32 v1, v1
-; GFX10-NEXT:    v_exp_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX10-NEXT:    v_pack_b32_f16 v0, v4, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT:    v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX11-NEXT:    v_exp_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    v_exp_f32_e32 v2, v2
-; GFX11-NEXT:    v_exp_f32_e32 v3, v3
-; GFX11-NEXT:    v_exp_f32_e32 v4, v4
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.exp.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <5 x half> @exp2_v5f16(<5 x half> %a) {
-; GFX8-LABEL: exp2_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_exp_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_exp_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    v_exp_f16_e32 v1, v1
-; GFX8-NEXT:    v_exp_f16_e32 v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp2_v5f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_exp_f16_e32 v2, v2
-; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp2_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_exp_f16_e32 v2, v2
-; GFX10-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp2_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_exp_f16_e32 v1, v1
-; GFX11-NEXT:    v_exp_f16_e32 v0, v0
-; GFX11-NEXT:    v_exp_f16_e32 v2, v2
-; GFX11-NEXT:    v_exp_f16_e32 v3, v3
-; GFX11-NEXT:    v_exp_f16_e32 v4, v4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.exp2.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <5 x half> @exp10_v5f16(<5 x half> %a) {
-; GFX8-LABEL: exp10_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX8-NEXT:    v_exp_f32_e32 v3, v3
-; GFX8-NEXT:    v_exp_f32_e32 v4, v4
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v1
-; GFX8-NEXT:    v_exp_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp10_v5f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX9-NEXT:    v_exp_f32_e32 v3, v3
-; GFX9-NEXT:    v_exp_f32_e32 v4, v4
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_exp_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp10_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX10-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX10-NEXT:    v_exp_f32_e32 v3, v3
-; GFX10-NEXT:    v_exp_f32_e32 v4, v4
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_exp_f32_e32 v1, v1
-; GFX10-NEXT:    v_exp_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX10-NEXT:    v_pack_b32_f16 v0, v4, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp10_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT:    v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX11-NEXT:    v_exp_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    v_exp_f32_e32 v2, v2
-; GFX11-NEXT:    v_exp_f32_e32 v3, v3
-; GFX11-NEXT:    v_exp_f32_e32 v4, v4
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.exp10.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <5 x half> @sqrt_v5f16(<5 x half> %a) {
-; GFX8-LABEL: sqrt_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sqrt_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_sqrt_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT:    v_sqrt_f16_e32 v1, v1
-; GFX8-NEXT:    v_sqrt_f16_e32 v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: sqrt_v5f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT:    v_sqrt_f16_e32 v2, v2
-; GFX9-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sqrt_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT:    v_sqrt_f16_e32 v2, v2
-; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sqrt_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT:    v_sqrt_f16_e32 v2, v2
-; GFX11-NEXT:    v_sqrt_f16_e32 v3, v3
-; GFX11-NEXT:    v_sqrt_f16_e32 v4, v4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%res = call <5 x half> @llvm.sqrt.v5f16(<5 x half> %a)
-	ret <5 x half> %res
-}
-
-define <4 x half> @cascaded_v4f16(<4 x half> %a) {
-; GFX8-LABEL: cascaded_v4f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v2, v1
-; GFX8-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX8-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX8-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT:    v_fract_f16_e32 v2, v2
-; GFX8-NEXT:    v_fract_f16_e32 v1, v1
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_fract_f16_e32 v3, v3
-; GFX8-NEXT:    v_sin_f16_e32 v2, v2
-; GFX8-NEXT:    v_sin_f16_e32 v0, v0
-; GFX8-NEXT:    v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: cascaded_v4f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_log_f16_e32 v2, v1
-; GFX9-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_log_f16_e32 v0, v0
-; GFX9-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT:    v_sin_f16_e32 v2, v2
-; GFX9-NEXT:    v_sin_f16_e32 v0, v0
-; GFX9-NEXT:    v_sin_f16_e32 v3, v3
-; GFX9-NEXT:    v_sin_f16_e32 v1, v1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cascaded_v4f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_e32 v2, v1
-; GFX10-NEXT:    v_log_f16_e32 v3, v0
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX10-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX10-NEXT:    v_sin_f16_e32 v2, v2
-; GFX10-NEXT:    v_sin_f16_e32 v3, v3
-; GFX10-NEXT:    v_sin_f16_e32 v0, v0
-; GFX10-NEXT:    v_sin_f16_e32 v1, v1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cascaded_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f16_e32 v3, v3
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT:    v_sin_f16_e32 v1, v1
-; GFX11-NEXT:    v_sin_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sin_f16_e32 v2, v2
-; GFX11-NEXT:    v_sin_f16_e32 v3, v3
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
-  %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
-	ret <4 x half> %res
-}
-
-define <5 x half> @cascaded_v5f16(<5 x half> %a) {
-; GFX8-LABEL: cascaded_v5f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v4, v1
-; GFX8-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_log_f16_e32 v2, v2
-; GFX8-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
-; GFX8-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX8-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX8-NEXT:    v_fract_f16_e32 v4, v4
-; GFX8-NEXT:    v_fract_f16_e32 v1, v1
-; GFX8-NEXT:    v_fract_f16_e32 v0, v0
-; GFX8-NEXT:    v_fract_f16_e32 v3, v3
-; GFX8-NEXT:    v_sin_f16_e32 v4, v4
-; GFX8-NEXT:    v_sin_f16_e32 v0, v0
-; GFX8-NEXT:    v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_fract_f16_e32 v2, v2
-; GFX8-NEXT:    v_sin_f16_e32 v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: cascaded_v5f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_log_f16_e32 v4, v1
-; GFX9-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_log_f16_e32 v0, v0
-; GFX9-NEXT:    v_log_f16_e32 v2, v2
-; GFX9-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
-; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX9-NEXT:    v_sin_f16_e32 v4, v4
-; GFX9-NEXT:    v_sin_f16_e32 v0, v0
-; GFX9-NEXT:    v_sin_f16_e32 v3, v3
-; GFX9-NEXT:    v_sin_f16_e32 v1, v1
-; GFX9-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX9-NEXT:    v_sin_f16_e32 v2, v2
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX9-NEXT:    v_pack_b32_f16 v1, v4, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cascaded_v5f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_e32 v3, v1
-; GFX10-NEXT:    v_log_f16_e32 v4, v0
-; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_log_f16_e32 v2, v2
-; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX10-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
-; GFX10-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX10-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX10-NEXT:    v_sin_f16_e32 v3, v3
-; GFX10-NEXT:    v_sin_f16_e32 v4, v4
-; GFX10-NEXT:    v_sin_f16_e32 v0, v0
-; GFX10-NEXT:    v_sin_f16_e32 v1, v1
-; GFX10-NEXT:    v_sin_f16_e32 v2, v2
-; GFX10-NEXT:    v_pack_b32_f16 v0, v4, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cascaded_v5f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_log_f16_e32 v1, v1
-; GFX11-NEXT:    v_log_f16_e32 v0, v0
-; GFX11-NEXT:    v_log_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f16_e32 v3, v3
-; GFX11-NEXT:    v_log_f16_e32 v4, v4
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT:    v_mul_f16_e32 v4, 0.15915494, v4
-; GFX11-NEXT:    v_sin_f16_e32 v1, v1
-; GFX11-NEXT:    v_sin_f16_e32 v0, v0
-; GFX11-NEXT:    v_sin_f16_e32 v2, v2
-; GFX11-NEXT:    v_sin_f16_e32 v3, v3
-; GFX11-NEXT:    v_sin_f16_e32 v4, v4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-	%b = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
-  %res = call <5 x half> @llvm.sin.v5f16(<5 x half> %b)
-	ret <5 x half> %res
-}



More information about the llvm-commits mailing list