[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)
Vikash Gupta via llvm-commits
llvm-commits at lists.llvm.org
Tue May 6 00:51:11 PDT 2025
https://github.com/vg0204 updated https://github.com/llvm/llvm-project/pull/137137
>From 3d836658c92823b71d5fdbac7c5fdb4d8680825b Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Wed, 23 Apr 2025 12:13:49 +0000
Subject: [PATCH 1/7] [AMDGPU][PeepholeOpt] Eliminate unnecessary packing in
fp16 vector operations for SDWA/OPSEL-able instruction
Because the compiler has no packed fp16 instruction for these operations,
isel scalarizes each fp16 operation on wide fp16 vectors and generates
separate fp16 results, which are later packed. Afterwards, in the post-isel
SIPeepholeSDWA pass, any eligible instruction is opportunistically
converted into its SDWA/OPSEL-able version.
This patch gets rid of the unnecessary packing in wider fp16 vector
operations for SDWA/OPSEL-able instructions by writing each partial
fp16 result into the same input register, while preserving the
remaining bits of that register, using the SDWA dst_unused operand
set to UNUSED_PRESERVE. Because it relies on SDWA instructions having
been generated, it is invoked at the end of the SIPeepholeSDWA pass.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 516 +++++++++++++++++++++-
1 file changed, 514 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 22f23e4c94e2d..b8bb27430707a 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -27,6 +27,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>
+#include <queue>
using namespace llvm;
@@ -35,6 +36,11 @@ using namespace llvm;
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
"Number of instruction converted to SDWA.");
+STATISTIC(Num16BitPackedInstructionsEliminated,
+ "Number of packed instruction eliminated.");
+STATISTIC(NumSDWAInstructionsToEliminateFP16Pack,
+ "Number of instruction converted/modified into SDWA to eliminate "
+ "FP16 packing.");
namespace {
@@ -66,6 +72,14 @@ class SIPeepholeSDWA {
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
+ void eliminateFP16Packing(MachineBasicBlock &MBB, const GCNSubtarget &ST);
+ unsigned
+ computeMIChainsForPackedOps(MachineInstr *ParentMI,
+ std::queue<MachineOperand *> &DefSrcQueue,
+ const GCNSubtarget &ST);
+ void convertMIToSDWAWithOpsel(MachineInstr &MI, MachineOperand &SrcMO,
+ AMDGPU::SDWA::SdwaSel OpSel);
+
public:
bool run(MachineFunction &MF);
};
@@ -266,13 +280,17 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
#endif
-static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From,
+ bool isKill = false) {
assert(To.isReg() && From.isReg());
To.setReg(From.getReg());
To.setSubReg(From.getSubReg());
To.setIsUndef(From.isUndef());
if (To.isUse()) {
- To.setIsKill(From.isKill());
+ if (isKill)
+ To.setIsKill(true);
+ else
+ To.setIsKill(From.isKill());
} else {
To.setIsDead(From.isDead());
}
@@ -1361,6 +1379,494 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
return SIPeepholeSDWA().run(MF);
}
+/// Returns true if \p MI is one of the 16-bit VALU opcodes accepted by the
+/// FP16 pack-elimination peephole. An SDWA instruction is classified by the
+/// basic opcode it was derived from.
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  unsigned Opc = MI->getOpcode();
+  if (TII->isSDWA(Opc))
+    Opc = AMDGPU::getBasicFromSDWAOp(Opc);
+
+  switch (Opc) {
+  // Conversions into f16.
+  case AMDGPU::V_CVT_F16_U16_e32:
+  case AMDGPU::V_CVT_F16_U16_e64:
+  case AMDGPU::V_CVT_F16_I16_e32:
+  case AMDGPU::V_CVT_F16_I16_e64:
+  // Transcendental operations.
+  case AMDGPU::V_RCP_F16_e32:
+  case AMDGPU::V_RCP_F16_e64:
+  case AMDGPU::V_RSQ_F16_e32:
+  case AMDGPU::V_RSQ_F16_e64:
+  case AMDGPU::V_SQRT_F16_e32:
+  case AMDGPU::V_SQRT_F16_e64:
+  case AMDGPU::V_LOG_F16_e32:
+  case AMDGPU::V_LOG_F16_e64:
+  case AMDGPU::V_EXP_F16_e32:
+  case AMDGPU::V_EXP_F16_e64:
+  case AMDGPU::V_SIN_F16_e32:
+  case AMDGPU::V_SIN_F16_e64:
+  case AMDGPU::V_COS_F16_e32:
+  case AMDGPU::V_COS_F16_e64:
+  // Rounding and fraction extraction.
+  case AMDGPU::V_FLOOR_F16_e32:
+  case AMDGPU::V_FLOOR_F16_e64:
+  case AMDGPU::V_CEIL_F16_e32:
+  case AMDGPU::V_CEIL_F16_e64:
+  case AMDGPU::V_TRUNC_F16_e32:
+  case AMDGPU::V_TRUNC_F16_e64:
+  case AMDGPU::V_RNDNE_F16_e32:
+  case AMDGPU::V_RNDNE_F16_e64:
+  case AMDGPU::V_FRACT_F16_e32:
+  case AMDGPU::V_FRACT_F16_e64:
+  // Exponent / mantissa manipulation.
+  case AMDGPU::V_FREXP_MANT_F16_e32:
+  case AMDGPU::V_FREXP_MANT_F16_e64:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+  case AMDGPU::V_LDEXP_F16_e32:
+  case AMDGPU::V_LDEXP_F16_e64:
+  // Two-operand arithmetic.
+  case AMDGPU::V_ADD_F16_e32:
+  case AMDGPU::V_ADD_F16_e64:
+  case AMDGPU::V_SUB_F16_e32:
+  case AMDGPU::V_SUB_F16_e64:
+  case AMDGPU::V_SUBREV_F16_e32:
+  case AMDGPU::V_SUBREV_F16_e64:
+  case AMDGPU::V_MUL_F16_e32:
+  case AMDGPU::V_MUL_F16_e64:
+  case AMDGPU::V_MAX_F16_e32:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MIN_F16_e32:
+  case AMDGPU::V_MIN_F16_e64:
+  // Three-operand arithmetic (VOP3 encodings only).
+  case AMDGPU::V_MAD_F16_e64:
+  case AMDGPU::V_FMA_F16_e64:
+  case AMDGPU::V_DIV_FIXUP_F16_e64:
+    return true;
+
+  // Explicitly rejected opcodes; listed to record that the exclusion is
+  // deliberate even though they behave exactly like the default case.
+  //
+  // Literal-constant MAD/FMA forms: the patch author was unsure whether they
+  // qualify, so they are conservatively rejected.
+  case AMDGPU::V_MADAK_F16:
+  case AMDGPU::V_MADMK_F16:
+  case AMDGPU::V_FMAMK_F16:
+  case AMDGPU::V_FMAAK_F16:
+  // MAC/FMAC: per the patch author, their SDWA versions only allow dst_sel
+  // to be set to DWORD.
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+  default:
+    return false;
+  }
+}
+
+/// Checks that the two chain roots read complementary halves of
+/// \p SrcRootReg: \p Def1MI must be an SDWA instruction reading it through a
+/// source whose sel is WORD_1, and \p Def0MI must read it through src0 or src1
+/// with either no sel operand at all or a sel of WORD_0.
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+                                       MachineInstr *Def1MI,
+                                       Register SrcRootReg,
+                                       const SIInstrInfo *TII) {
+  // If Def1MI could have been converted, it would already be in SDWA form.
+  if (!TII->isSDWA(Def1MI->getOpcode()))
+    return false;
+
+  // True if the named source operand of Inst is a register operand on
+  // SrcRootReg.
+  auto ReadsRoot = [&](MachineInstr *Inst, auto SrcName) {
+    const MachineOperand *Src = TII->getNamedOperand(*Inst, SrcName);
+    return Src && Src->isReg() && Src->getReg() == SrcRootReg;
+  };
+
+  // True if Def0MI reads SrcRootReg via the named source and that source has
+  // no sel operand or selects WORD_0.
+  auto Def0ReadsLowVia = [&](auto SrcName, auto SelName) {
+    if (!ReadsRoot(Def0MI, SrcName))
+      return false;
+    const MachineOperand *Sel = TII->getNamedOperand(*Def0MI, SelName);
+    return !Sel || Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0;
+  };
+
+  // Verdict contributed by one source of Def1MI:
+  //   nullopt - this source does not settle the question,
+  //   false   - it reads the root register but not as WORD_1,
+  //   true    - it reads WORD_1 and Def0MI reads the low half.
+  auto JudgeDef1Source = [&](auto SrcName,
+                             auto SelName) -> std::optional<bool> {
+    if (!ReadsRoot(Def1MI, SrcName))
+      return std::nullopt;
+
+    const MachineOperand *Sel = TII->getNamedOperand(*Def1MI, SelName);
+    if (!Sel || Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
+      return false;
+
+    if (Def0ReadsLowVia(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel) ||
+        Def0ReadsLowVia(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
+      return true;
+
+    return std::nullopt;
+  };
+
+  // src0 is consulted first; src1 only when src0 left the question open.
+  std::optional<bool> Verdict =
+      JudgeDef1Source(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel);
+  if (!Verdict.has_value())
+    Verdict = JudgeDef1Source(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel);
+
+  return Verdict.value_or(false);
+}
+
+/// Returns true if \p A is \p B itself or comes before \p B within A's basic
+/// block, or if \p B is the end iterator of that block.
+///
+/// Returns false when \p B belongs to a different basic block. The callers in
+/// this file pass the defining instructions of two operand chains without
+/// checking that they share a block, and an assert alone would let a release
+/// build scan past the end of the block.
+static bool dominates(MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  const MachineBasicBlock *MBB = A->getParent();
+  if (B == MBB->end())
+    return true;
+
+  // No intra-block ordering exists between instructions of different blocks.
+  if (B->getParent() != MBB)
+    return false;
+
+  // Scan the block once; whichever of A and B is met first decides. A is
+  // tested first so that A == B yields true.
+  for (const MachineInstr &I : *MBB) {
+    if (&I == &*A)
+      return true;
+    if (&I == &*B)
+      return false;
+  }
+
+  // Not reachable for valid iterators: A is an instruction of MBB.
+  return false;
+}
+
+// Convert MI into its SDWA version (unless it already is one), set its
+// dst_sel and the sel of the source that reads SrcMO's register to OpSel, and
+// preserve the rest of the destination's bits: dst_unused becomes
+// UNUSED_PRESERVE and an implicit use of SrcMO's register is appended and
+// tied to vdst.
+//
+// SrcMO is allowed to be an operand of MI itself, so when a new SDWA
+// instruction replaces MI, MI is erased only after the last read of SrcMO.
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+                                              MachineOperand &SrcMO,
+                                              AMDGPU::SDWA::SdwaSel OpSel) {
+  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+  const bool AlreadySDWA = TII->isSDWA(MI.getOpcode());
+  MachineInstr *SDWAInst = AlreadySDWA ? &MI : createSDWAVersion(MI);
+
+  ConvertedInstructions.push_back(SDWAInst);
+  unsigned SDWAOpcode = SDWAInst->getOpcode();
+  ++NumSDWAInstructionsToEliminateFP16Pack;
+
+  assert(TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst) &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+  MachineOperand *DstSel =
+      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+  assert(DstSel &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+  DstSel->setImm(OpSel);
+
+  MachineOperand *DstUnused =
+      TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+  assert(DstUnused &&
+         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+  assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+         "Dst_unused should not be UNUSED_PRESERVE already");
+  DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+  // Append an implicit use of the source register and tie it to vdst, so the
+  // bits not selected by dst_sel come from that register.
+  auto PreserveDstIdx =
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+  assert(PreserveDstIdx != -1);
+  auto NewSrcImplitMO = MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+  copyRegOperand(NewSrcImplitMO, SrcMO);
+  SDWAInst->addOperand(NewSrcImplitMO);
+  SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+
+  // Sets the sel operand of the named source to OpSel if that source reads
+  // SrcMO's register; returns whether it did. A missing source operand (e.g.
+  // src1 on a one-source instruction) simply does not match.
+  auto RetargetSelIfReadsSrc = [&](auto SrcName, auto SelName) {
+    MachineOperand *Src = TII->getNamedOperand(*SDWAInst, SrcName);
+    if (!Src || !Src->isReg() || Src->getReg() != SrcMO.getReg())
+      return false;
+
+    MachineOperand *Sel = TII->getNamedOperand(*SDWAInst, SelName);
+    assert(Sel && AMDGPU::hasNamedOperand(SDWAOpcode, SelName));
+    Sel->setImm(OpSel);
+    return true;
+  };
+
+  // src0 takes precedence; src1 is only looked at when src0 did not match.
+  if (RetargetSelIfReadsSrc(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel) ||
+      RetargetSelIfReadsSrc(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel)) {
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+  }
+
+  // All reads of SrcMO are done; the replaced instruction can go now.
+  if (!AlreadySDWA)
+    MI.eraseFromParent();
+}
+
+// Backtracks from ParentMI through its use operands, looking for operands
+// defined by an FP16 instruction that is also convertible to SDWA.
+//
+// For every instruction visited, each use that is a single-use virtual
+// register whose def (as reported by findSingleRegDef) satisfies both
+// isSrcDestFP16Bits and isConvertibleToSDWA has its def operand pushed onto
+// DefSrcQueue. The walk then continues from the defining instruction and
+// stops when an instruction has no such def, or more than one.
+//
+// Returns the number of qualifying defs found for the last instruction
+// visited: 0 when the chain simply ended, or a value greater than 1 when the
+// chain forks (in which case all defs of the fork have already been pushed).
+unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
+    MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
+    const GCNSubtarget &ST) {
+  unsigned NumOfFP16Def = 0;
+  do {
+    MachineInstr *NextMIInChain = nullptr;
+    NumOfFP16Def = 0;
+    for (MachineOperand &CurrentMO : ParentMI->uses()) {
+      // Only single-use virtual registers can be rewired safely.
+      if (!CurrentMO.isReg() || CurrentMO.getReg().isPhysical() ||
+          !MRI->hasOneUse(CurrentMO.getReg()))
+        continue;
+
+      MachineOperand *DefCurrMO = findSingleRegDef(&CurrentMO, MRI);
+      if (!DefCurrMO)
+        continue;
+
+      MachineInstr *DefCurrMI = DefCurrMO->getParent();
+      if (!isSrcDestFP16Bits(DefCurrMI, TII) ||
+          !isConvertibleToSDWA(*DefCurrMI, ST, TII))
+        continue;
+
+      NextMIInChain = DefCurrMI;
+      DefSrcQueue.push(DefCurrMO);
+      NumOfFP16Def++;
+    }
+
+    // More than one qualifying def: the chain forks, report it to the caller.
+    if (NumOfFP16Def > 1)
+      break;
+
+    ParentMI = NextMIInChain;
+  } while (ParentMI);
+
+  return NumOfFP16Def;
+}
+
+void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
+ const GCNSubtarget &ST) { // Removes v_pack_b32_f16 in MBB when both inputs come from FP16 chains sharing one root register.
+ if (!ST.has16BitInsts()) // Nothing to do without 16-bit instructions.
+ return;
+
+ for (MachineInstr &MI : make_early_inc_range(MBB)) { // MI may be erased below.
+ if (MI.getOpcode() == AMDGPU::V_PACK_B32_F16_e64) {
+ LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
+ std::queue<MachineOperand *> DefSrc0Queue;
+ std::queue<MachineOperand *> DefSrc1Queue;
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+ if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+ !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+ Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+ continue; // Both pack sources must be single-use virtual registers.
+
+ MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+ MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+
+ if (!Op0 || !Op1)
+ continue;
+
+ MachineInstr *ParentMIOp0 = Op0->getParent();
+ MachineInstr *ParentMIOp1 = Op1->getParent();
+
+ if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+ !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+ continue;
+
+ DefSrc0Queue.push(Op0);
+ DefSrc1Queue.push(Op1);
+
+ // Walk back along each operand chain. A result > 1 means an instruction in
+ // the chain had more than one FP16, SDWA-convertible def among its uses.
+ unsigned NumOfFP16Def;
+
+ NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
+ if (NumOfFP16Def > 1)
+ continue;
+
+ NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
+ if (NumOfFP16Def > 1)
+ continue;
+
+ MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+ MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+ Register SrcRootMOReg = AMDGPU::NoRegister;
+
+ // Now check whether the last-pushed instruction of each DefSrcQueue uses
+ // a common virtual register (same reg and subreg); that register is the
+ // source root of the element-wise fp16 chain operations.
+ for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+ if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+ continue;
+
+ for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+ if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+ continue;
+
+ if (Current0MO.getReg() == Current1MO.getReg() &&
+ Current0MO.getSubReg() == Current1MO.getSubReg()) {
+ SrcRootMOReg = Current0MO.getReg();
+ break;
+ }
+ }
+ // Found it; no further checks are needed, so break.
+ if (SrcRootMOReg != AMDGPU::NoRegister)
+ break;
+ }
+
+ if (SrcRootMOReg == AMDGPU::NoRegister)
+ continue;
+
+ // Also ensure that Def0RootMI and Def1RootMI access the lower and upper
+ // half-word of SrcRootMOReg respectively.
+ if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg,
+ TII))
+ continue;
+
+ // The graph below represents the connection :
+ // Op0Intial --> Op0x --> ... --> Op0Final
+ // / \'
+ // SrcRootMO v_Pack_b32_f16
+ // \ /
+ // Op1Intial --> Op1x --> ... --> Op1Final
+ // The nomenclature is based upon above flow-graph
+ //
+ // Also for each of DefSrcXQueue :
+ // OpXIntial is at back & OpXFinal is at front
+ auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
+ auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
+ auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
+ auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
+
+ MachineOperand *FinalOutMO = nullptr;
+ std::queue<MachineOperand *> ChainedDefOps;
+ AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+ int NumOfElemInSecondOpChain = 0;
+
+ // Now serialize the two chains according to the dominance of their MIs, if
+ // possible, and record the resulting order in ChainedDefOps so that each
+ // instruction can later be converted into its SDWA version:
+ //
+ // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
+ // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
+ // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
+ //
+ // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
+ // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
+ // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
+ //
+ // TODO : Any other ordering is not handled!
+ // One such case is observed when multiple fp16 instructions are chained
+ // on an fp16 vector input. For example:
+ //
+ // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+ // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+ // return <2 x half> %res
+ if (dominates(Op0FinalMI, Op1IntialMI)) {
+ int OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &MOTo = Op1IntialMI->getOperand(OpIdx);
+ auto MOFrom = DefSrc0Queue.front();
+ copyRegOperand(MOTo, *MOFrom, true); // Chain 1 now reads chain 0's final result instead of the root.
+ FinalOutMO = DefSrc1Queue.front();
+
+ LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op1IntialMI
+ << '\n');
+ OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &IntialInMO = Op0IntialMI->getOperand(OpIdx);
+
+ while (!DefSrc1Queue.empty()) {
+ ChainedDefOps.push(DefSrc1Queue.front());
+ DefSrc1Queue.pop();
+ NumOfElemInSecondOpChain++;
+ }
+ while (!DefSrc0Queue.empty()) {
+ ChainedDefOps.push(DefSrc0Queue.front());
+ DefSrc0Queue.pop();
+ }
+
+ ChainedDefOps.push(&IntialInMO);
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ } else if (dominates(Op1FinalMI, Op0IntialMI)) {
+ int OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &MOTo = Op0IntialMI->getOperand(OpIdx);
+ auto MOFrom = DefSrc1Queue.front();
+ copyRegOperand(MOTo, *MOFrom, true); // Chain 0 now reads chain 1's final result instead of the root.
+ FinalOutMO = DefSrc0Queue.front();
+
+ LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op0IntialMI
+ << '\n');
+ OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &IntialInMO = Op1IntialMI->getOperand(OpIdx);
+
+ while (!DefSrc0Queue.empty()) {
+ ChainedDefOps.push(DefSrc0Queue.front());
+ DefSrc0Queue.pop();
+ NumOfElemInSecondOpChain++;
+ }
+ while (!DefSrc1Queue.empty()) {
+ ChainedDefOps.push(DefSrc1Queue.front());
+ DefSrc1Queue.pop();
+ }
+
+ ChainedDefOps.push(&IntialInMO);
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+ } else {
+ LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+ continue;
+ }
+
+ // Redirect every non-debug use of the v_pack result to FinalOutMO.
+ MachineOperand &DefMO = MI.getOperand(0);
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+ if (!MO.isReg())
+ continue;
+
+ MO.setReg(FinalOutMO->getReg());
+ MO.setSubReg(FinalOutMO->getSubReg());
+ }
+ LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI
+ << "With " << *FinalOutMO << '\n');
+
+ // Delete the v_pack machine instruction.
+ LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+ MI.eraseFromParent();
+ ++Num16BitPackedInstructionsEliminated;
+
+ // Convert each chained machine instruction into its SDWA version.
+ while (ChainedDefOps.size() != 1) { // The last entry is the root use operand, not a def to convert.
+ if (NumOfElemInSecondOpChain == 0) { // Crossing into the other chain: flip the half-word.
+ if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ else
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+ }
+
+ MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+ ChainedDefOps.pop();
+ MachineOperand *SrcMO = ChainedDefOps.front();
+
+ // If SrcMO is a def, switch to its single use, which must be in DefMI.
+ if (SrcMO->isDef()) {
+ assert(MRI->hasOneUse(SrcMO->getReg()));
+ SrcMO = findSingleRegUse(SrcMO, MRI);
+ assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
+ }
+
+ convertMIToSDWAWithOpsel(*DefMI, *SrcMO, OpSel);
+ NumOfElemInSecondOpChain--;
+ }
+ }
+ }
+}
+
bool SIPeepholeSDWA::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -1418,6 +1924,12 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
while (!ConvertedInstructions.empty())
legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
} while (Changed);
+
+ // Process each v_pack_b32_fp16 instruction in MBB.
+ eliminateFP16Packing(MBB, ST);
+ Ret |= !ConvertedInstructions.empty();
+ while (!ConvertedInstructions.empty())
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
}
return Ret;
>From 98bdc1d7e323ced38689b78a26edbf9b6451c006 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 06:56:22 +0000
Subject: [PATCH 2/7] Update the LIT tests to accommodate the patch effects.
---
.../AMDGPU/GlobalISel/combine-fma-sub-mul.ll | 88 +++---
.../GlobalISel/combine-fma-sub-neg-mul.ll | 40 ++-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 100 ++++---
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 20 +-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 51 ++--
llvm/test/CodeGen/AMDGPU/fract-match.ll | 11 +-
llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 18 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 14 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 18 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 35 +--
llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 23 +-
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 44 ++-
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 260 ++++++++++--------
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 260 ++++++++++--------
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 110 +++-----
llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 5 +-
llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 18 +-
llvm/test/CodeGen/AMDGPU/repeated-divisor.ll | 12 +-
llvm/test/CodeGen/AMDGPU/roundeven.ll | 80 +++---
19 files changed, 594 insertions(+), 613 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index e8e29c3d4b526..e7cc28eeabffb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -545,12 +545,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -565,12 +563,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul:
@@ -578,12 +574,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -598,12 +592,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -644,12 +636,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -664,12 +656,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v1, v5
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul_rhs:
@@ -677,12 +669,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -697,12 +689,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index 70f961e2777af..ad11c9b5f28ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -221,12 +221,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -241,12 +239,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -254,12 +250,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -274,12 +268,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%a = fmul <4 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 5ba036c386a40..56827df6f027c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1101,21 +1101,21 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_afn:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_afn:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_afn:
@@ -2782,17 +2782,15 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX9-LABEL: v_rcp_v2f16_arcp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_arcp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_arcp:
@@ -2834,17 +2832,15 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
@@ -3192,21 +3188,21 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
@@ -3310,21 +3306,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3372,21 +3368,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index c43731893c2d7..12d7f9d4af8c4 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -2188,17 +2188,15 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-LABEL: v_rsq_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rsq_f16_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_rsq_f16_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_rsq_v2f16:
@@ -2398,17 +2396,15 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-LABEL: v_neg_rsq_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rsq_f16_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_neg_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_rsq_f16_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 63ba18a5433aa..68a3db1472aa2 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -187,22 +187,18 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; GFX10-LABEL: fmul_pow2_8xhalf:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v3
-; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v2
-; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v1
-; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v0
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v7, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
-; GFX10-NEXT: v_pack_b32_f16 v2, v5, v2
-; GFX10-NEXT: v_pack_b32_f16 v3, v4, v3
+; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
@@ -302,18 +298,14 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, 0x7000
-; GFX10-NEXT: v_ldexp_f16_e32 v5, 0x7000, v3
-; GFX10-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2
-; GFX10-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1
-; GFX10-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0
-; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v8, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v7, v1
-; GFX10-NEXT: v_pack_b32_f16 v2, v6, v2
-; GFX10-NEXT: v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: fmul_pow2_ldexp_8xhalf:
@@ -1085,9 +1077,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
-; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v0
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index d957ba93e4fb3..1ca358b90c58a 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1587,9 +1587,8 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
; GFX8-LABEL: basic_fract_v2f16_nonan:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_fract_f16_e32 v1, v0
-; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan:
@@ -2610,15 +2609,15 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s6, 0x204
-; GFX8-NEXT: v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_floor_f16_e32 v4, v0
; GFX8-NEXT: v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cmp_class_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_pack_b32_f16 v3, v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
; GFX8-NEXT: v_fract_f16_e32 v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5]
; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s6
+; GFX8-NEXT: v_floor_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX8-NEXT: v_floor_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX8-NEXT: v_pack_b32_f16 v0, v0, v5
; GFX8-NEXT: global_store_dword v[1:2], v3, off
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 8c5bc4a33a303..a8ddc564e4b51 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -188,11 +188,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cos_f16_e32 v2, v3
-; GFX9-NEXT: v_cos_f16_e32 v1, v1
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -204,11 +203,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_e32 v2, v3
-; GFX10-NEXT: v_cos_f16_e32 v1, v1
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index fdccacf372dfa..a3ee9655f40a4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6719,9 +6719,8 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
; GFX900-SDAG-NEXT: v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v0
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v2f16_fast:
@@ -6904,13 +6903,12 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 0c2e6f82c9115..2dff6e21f8a17 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6811,11 +6811,10 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v0
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_v2f16_fast:
@@ -6998,13 +6997,12 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index c34113a5dfab0..e6e518d13b5f7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -3335,9 +3335,8 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v2f16:
@@ -3413,9 +3412,8 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fabs_v2f16:
@@ -3497,9 +3495,8 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -3582,9 +3579,8 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fneg_v2f16:
@@ -3656,9 +3652,8 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v2f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v2f16_fast:
@@ -3738,10 +3733,9 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_exp_v3f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v3f16:
@@ -3822,10 +3816,9 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v3f16_afn:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index f44faf4f7edba..c7dcbcfde6d89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -411,9 +411,9 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v2, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v3, v2, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v0, v0
; GFX9-SDAG-NEXT: v_bfe_i32 v2, v1, 0, 16
@@ -522,13 +522,14 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v3, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX9-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16
-; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v3, v4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
@@ -628,9 +629,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v0, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
@@ -698,9 +698,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v1, v0
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 0e66b0af99f34..fe05fdf1226ec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -461,10 +461,9 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7fff
; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v3
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v3
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -585,9 +584,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -697,12 +695,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7fff
; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v5
-; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v5
; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v5
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v5
; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -844,10 +841,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v4
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -973,15 +969,13 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7fff
; GFX9-SDAG-NEXT: v_med3_i32 v5, v5, s4, v6
-; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v6
; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v6
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v6
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v6
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v5
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1157,12 +1151,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v5
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 1dd6a7926029e..bb23fdd26402d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6870,15 +6870,25 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v2f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7005,22 +7015,22 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_v2f16:
@@ -7157,22 +7167,22 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fneg_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_v2f16:
@@ -7310,22 +7320,22 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fneg_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_v2f16:
@@ -7449,15 +7459,25 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v2f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7576,17 +7596,29 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v3f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7715,17 +7747,29 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v3f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7886,31 +7930,29 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log_v4f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_v4f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16:
@@ -8089,31 +8131,29 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log_v4f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_v4f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 86a58d26c6ae5..dcf789e26de54 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6870,15 +6870,25 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v2f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7005,22 +7015,22 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_v2f16:
@@ -7157,22 +7167,22 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7310,22 +7320,22 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fneg_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_v2f16:
@@ -7449,15 +7459,25 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v2f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7576,17 +7596,29 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v3f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7715,17 +7747,29 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v3f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7886,31 +7930,29 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_v4f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_v4f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16:
@@ -8089,31 +8131,29 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_v4f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_v4f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index ea88f77f98735..5544fd764e841 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -4249,17 +4249,15 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16:
@@ -4367,18 +4365,16 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_v2f16:
@@ -4494,18 +4490,16 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2f16:
@@ -4622,18 +4616,16 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fneg_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_v2f16:
@@ -4739,17 +4731,15 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v2f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v2f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16_fast:
@@ -4861,19 +4851,17 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v3f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v3f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16:
@@ -4989,19 +4977,17 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v3f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v3f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16_fast:
@@ -5129,23 +5115,19 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v4f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v4f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16:
@@ -5284,23 +5266,19 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v4f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v4f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index de12f2b246f57..c6fee73f4580d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -198,9 +198,8 @@ define amdgpu_kernel void @rint_v2f16(
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 1a426096da197..92e8dce75222a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -188,11 +188,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_sin_f16_e32 v2, v3
-; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -204,11 +203,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_e32 v2, v3
-; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index 04eea20993608..9f31bde8086d0 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -829,9 +829,8 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
; GFX9-LABEL: v_repeat_divisor_v2f16_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rcp_f16_e32 v2, v2
-; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3
+; GFX9-NEXT: v_rcp_f16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rcp_f16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -932,15 +931,14 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
; GFX9-LABEL: v_repeat_divisor_v3f16_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rcp_f16_e32 v4, v4
+; GFX9-NEXT: v_rcp_f16_sdwa v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: v_rcp_f16_e32 v5, v5
; GFX9-NEXT: s_movk_i32 s4, 0x7e00
-; GFX9-NEXT: v_pack_b32_f16 v4, v4, v6
+; GFX9-NEXT: v_rcp_f16_sdwa v4, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-NEXT: v_pack_b32_f16 v5, v5, s4
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5
; GFX9-NEXT: v_pk_mul_f16 v3, v3, v5
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
; GFX9-NEXT: v_pk_mul_f16 v4, v2, v4
; GFX9-NEXT: v_alignbit_b32 v2, v3, v4, 16
; GFX9-NEXT: v_pack_b32_f16 v1, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 59a1fe041bf90..97358044abdaa 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -460,17 +460,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX9-LABEL: v_roundeven_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rndne_f16_e32 v1, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -523,17 +521,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v2f16:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v2f16:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -602,18 +598,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f16_fneg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT: v_rndne_f16_e32 v1, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -676,17 +670,15 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v2f16_fneg:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e64 v0, -v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v2f16_fneg:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e64 v0, -v0
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -759,23 +751,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX9-LABEL: v_roundeven_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rndne_f16_e32 v2, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rndne_f16_e32 v3, v1
-; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v4f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rndne_f16_e32 v2, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_rndne_f16_e32 v3, v1
-; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v4f16:
@@ -850,23 +838,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v4f16:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v1, v1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v4f16:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v1, v1
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v4f16:
>From 6e3bca2f2ea8ba80626e49ee7532b34d1eda7d19 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 10:13:40 +0000
Subject: [PATCH 3/7] [AMDGPU][NFC] Added Pre-commit tests for #137137
This adds an llc LIT test for vector fp16 operations such as log, exp, etc.
It acts as the pre-commit test for GitHub PR #137137.
---
llvm/test/CodeGen/AMDGPU/vector-fp16.ll | 2758 +++++++++++++++++++++++
1 file changed, 2758 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/vector-fp16.ll
diff --git a/llvm/test/CodeGen/AMDGPU/vector-fp16.ll b/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
new file mode 100644
index 0000000000000..501630e790200
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
@@ -0,0 +1,2758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare <1 x half> @llvm.sin.v1f16(<1 x half>)
+declare <1 x half> @llvm.cos.v1f16(<1 x half>)
+declare <1 x half> @llvm.log.v1f16(<1 x half>)
+declare <1 x half> @llvm.log2.v1f16(<1 x half>)
+declare <1 x half> @llvm.log10.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp2.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp10.v1f16(<1 x half>)
+declare <1 x half> @llvm.sqrt.v1f16(<1 x half>)
+
+declare <2 x half> @llvm.sin.v2f16(<2 x half>)
+declare <2 x half> @llvm.cos.v2f16(<2 x half>)
+declare <2 x half> @llvm.log.v2f16(<2 x half>)
+declare <2 x half> @llvm.log2.v2f16(<2 x half>)
+declare <2 x half> @llvm.log10.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp2.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp10.v2f16(<2 x half>)
+declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
+
+declare <4 x half> @llvm.sin.v4f16(<4 x half>)
+declare <4 x half> @llvm.cos.v4f16(<4 x half>)
+declare <4 x half> @llvm.log.v4f16(<4 x half>)
+declare <4 x half> @llvm.log2.v4f16(<4 x half>)
+declare <4 x half> @llvm.log10.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp2.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp10.v4f16(<4 x half>)
+declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
+
+declare <5 x half> @llvm.sin.v5f16(<5 x half>)
+declare <5 x half> @llvm.cos.v5f16(<5 x half>)
+declare <5 x half> @llvm.log.v5f16(<5 x half>)
+declare <5 x half> @llvm.log2.v5f16(<5 x half>)
+declare <5 x half> @llvm.log10.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp2.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp10.v5f16(<5 x half>)
+declare <5 x half> @llvm.sqrt.v5f16(<5 x half>)
+
+
+define <1 x half> @sin_v1f16(<1 x half> %a) {
+; GFX8-LABEL: sin_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sin_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT: v_sin_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT: v_sin_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.sin.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @cos_v1f16(<1 x half> %a) {
+; GFX8-LABEL: cos_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_cos_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cos_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT: v_cos_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT: v_cos_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cos_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.cos.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @log_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v1f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_e32 v0, v0
+; GFX906-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v1f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_e32 v0, v0
+; GFX908-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v1f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.log.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @log2_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log2_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.log2.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @log10_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log10_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v1f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_e32 v0, v0
+; GFX906-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v1f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_e32 v0, v0
+; GFX908-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v1f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.log10.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @exp_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp_v1f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX906-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX906-NEXT: v_exp_f32_e32 v0, v0
+; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp_v1f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX908-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX908-NEXT: v_exp_f32_e32 v0, v0
+; GFX908-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp_v1f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX942-NEXT: v_exp_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.exp.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+; exp2 on a one-element half vector: calls @llvm.exp2.v1f16 on %a and returns
+; the result. Every check prefix used below (GFX8, GFX9, GFX10 and GFX11)
+; expects exactly one v_exp_f16_e32 v0, v0 between the entry s_waitcnt and the
+; s_setpc_b64 return; no SDWA form is expected for the one-element case.
+define <1 x half> @exp2_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp2_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_exp_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.exp2.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+; exp10 on a one-element half vector: calls @llvm.exp10.v1f16 on %a and
+; returns the result. All prefixes expect the value to be widened with
+; v_cvt_f32_f16, multiplied by the f32 literal 0x3fb8aa3b, passed through
+; v_exp_f32 and narrowed again with v_cvt_f16_f32. The GFX942 checks also
+; expect an s_nop 0 after v_exp_f32, and the GFX11 checks expect an s_delay_alu
+; before the multiply and an s_waitcnt_depctr 0xfff after v_exp_f32.
+; NOTE(review): 0x3fb8aa3b decodes to about 1.4427 (log2 e), the same
+; multiplier the @exp_v1f16 checks use; an exp10 expansion would normally be
+; expected to scale by log2(10). These lines presumably record what the
+; compiler emits, so confirm the backend lowering is intended rather than
+; hand-editing the expectations.
+define <1 x half> @exp10_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp10_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp10_v1f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX906-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX906-NEXT: v_exp_f32_e32 v0, v0
+; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp10_v1f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX908-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX908-NEXT: v_exp_f32_e32 v0, v0
+; GFX908-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp10_v1f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX942-NEXT: v_exp_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.exp10.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+; sqrt on a one-element half vector: calls @llvm.sqrt.v1f16 on %a and returns
+; the result. Every check prefix used below (GFX8, GFX9, GFX10 and GFX11)
+; expects exactly one v_sqrt_f16_e32 v0, v0 between the entry s_waitcnt and the
+; s_setpc_b64 return; no SDWA form is expected for the one-element case.
+define <1 x half> @sqrt_v1f16(<1 x half> %a) {
+; GFX8-LABEL: sqrt_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.sqrt.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+; sin on <2 x half>: calls @llvm.sin.v2f16 on %a and returns the result.
+; Expected code per check prefix, as listed below:
+;  * GFX8 keeps the halves in separate registers: the high half is scaled with
+;    v_mul_f16_sdwa (src0_sel WORD_1, constant 0x3118 in v1), the low half with
+;    v_mul_f16_e32 by 0.15915494, both go through v_fract_f16, the sines are
+;    produced with UNUSED_PAD and the halves are merged by a final v_or_b32.
+;  * GFX906, GFX908, GFX942 and GFX10 do everything in place on v0: the
+;    multiply and the sine for WORD_0, then for WORD_1, each as an SDWA
+;    instruction with dst_unused UNUSED_PRESERVE, and no trailing v_or_b32 or
+;    v_pack_b32_f16. The GFX942 checks also expect an s_nop 0 between
+;    consecutive SDWA instructions.
+;  * GFX11 uses no SDWA: v_lshrrev_b32 extracts the high half, each half gets
+;    e32 multiply and sine, and v_pack_b32_f16 recombines them.
+; 0x3118 is the f16 encoding of roughly 0.1592, i.e. the same scale that the
+; e32 forms print as 0.15915494.
+define <2 x half> @sin_v2f16(<2 x half> %a) {
+; GFX8-LABEL: sin_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.sin.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+; cos on <2 x half>: calls @llvm.cos.v2f16 on %a and returns the result.
+; Expected code per check prefix, as listed below:
+;  * GFX8 keeps the halves in separate registers: the high half is scaled with
+;    v_mul_f16_sdwa (src0_sel WORD_1, constant 0x3118 in v1), the low half with
+;    v_mul_f16_e32 by 0.15915494, both go through v_fract_f16, the cosines are
+;    produced with UNUSED_PAD and the halves are merged by a final v_or_b32.
+;  * GFX906, GFX908, GFX942 and GFX10 do everything in place on v0: the
+;    multiply and the cosine for WORD_0, then for WORD_1, each as an SDWA
+;    instruction with dst_unused UNUSED_PRESERVE, and no trailing v_or_b32 or
+;    v_pack_b32_f16. The GFX942 checks also expect an s_nop 0 between
+;    consecutive SDWA instructions.
+;  * GFX11 uses no SDWA: v_lshrrev_b32 extracts the high half, each half gets
+;    e32 multiply and cosine, and v_pack_b32_f16 recombines them.
+; 0x3118 is the f16 encoding of roughly 0.1592, i.e. the same scale that the
+; e32 forms print as 0.15915494.
+define <2 x half> @cos_v2f16(<2 x half> %a) {
+; GFX8-LABEL: cos_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cos_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_cos_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cos_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.cos.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+; Natural log on <2 x half>: calls @llvm.log.v2f16 on %a and returns the
+; result. Every prefix expects v_log_f16 on each half followed by a multiply
+; with the f16 constant 0x398c (roughly 0.693, i.e. about ln 2).
+;  * GFX8 computes the high half into v1 with SDWA (UNUSED_PAD), the low half
+;    with e32 instructions, and merges them with a final v_or_b32.
+;  * GFX906, GFX908, GFX942 and GFX10 do log and multiply in place on v0, first
+;    for WORD_0 and then for WORD_1, all as SDWA with dst_unused
+;    UNUSED_PRESERVE and no trailing merge. The constant is expected in s4
+;    (GFX906, GFX908), in s0 (GFX942) or in v1 (GFX10); the GFX942 checks also
+;    contain s_nop 0 instructions.
+;  * GFX11 uses no SDWA: v_lshrrev_b32 splits the halves, e32 log and multiply
+;    are applied to each, and v_pack_b32_f16 recombines them.
+define <2 x half> @log_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x398c
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x398c
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x398c
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.log.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+; log2 on <2 x half>: calls @llvm.log2.v2f16 on %a and returns the result.
+;  * GFX8 expects v_log_f16_sdwa for the high half into v1 (WORD_1,
+;    UNUSED_PAD), v_log_f16_e32 for the low half, and a v_or_b32 merge.
+;  * GFX906, GFX908, GFX942 and GFX10 expect two in-place v_log_f16_sdwa on v0
+;    with dst_unused UNUSED_PRESERVE, WORD_1 first and then WORD_0, and no
+;    merge instruction. The GFX942 checks also expect an s_nop 0 between them.
+;  * GFX11 expects no SDWA: v_lshrrev_b32, one v_log_f16_e32 per half, and
+;    v_pack_b32_f16.
+define <2 x half> @log2_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log2_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log2_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log2_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log2_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.log2.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+; log10 on <2 x half>: calls @llvm.log10.v2f16 on %a and returns the result.
+; Every prefix expects v_log_f16 on each half followed by a multiply with the
+; f16 constant 0x34d1 (roughly 0.301, i.e. about log10 of 2).
+;  * GFX8 computes the high half into v1 with SDWA (UNUSED_PAD), the low half
+;    with e32 instructions, and merges them with a final v_or_b32.
+;  * GFX906, GFX908, GFX942 and GFX10 do log and multiply in place on v0, first
+;    for WORD_0 and then for WORD_1, all as SDWA with dst_unused
+;    UNUSED_PRESERVE and no trailing merge. The constant is expected in s4
+;    (GFX906, GFX908), in s0 (GFX942) or in v1 (GFX10); the GFX942 checks also
+;    contain s_nop 0 instructions.
+;  * GFX11 uses no SDWA: v_lshrrev_b32 splits the halves, e32 log and multiply
+;    are applied to each, and v_pack_b32_f16 recombines them.
+define <2 x half> @log10_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log10_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x34d1
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x34d1
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x34d1
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.log10.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+; exp on <2 x half>: calls @llvm.exp.v2f16 on %a and returns the result.
+; Every prefix expects each half to be widened to f32, multiplied by the f32
+; literal 0x3fb8aa3b (about 1.4427, i.e. log2 e), passed through v_exp_f32 and
+; narrowed back to f16.
+;  * GFX8 narrows the high half with v_cvt_f16_f32_sdwa into WORD_1
+;    (UNUSED_PAD) and merges with v_or_b32.
+;  * GFX9 and GFX10 narrow both halves with e32 conversions and still combine
+;    them with v_pack_b32_f16; no UNUSED_PRESERVE form is expected here.
+;  * GFX11 splits with v_lshrrev_b32, expects the two multiplies fused into one
+;    v_dual_mul_f32, and recombines with v_pack_b32_f16.
+define <2 x half> @exp_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.exp.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+; exp2 on <2 x half>: calls @llvm.exp2.v2f16 on %a and returns the result.
+;  * GFX8 expects v_exp_f16_sdwa for the high half into v1 (WORD_1,
+;    UNUSED_PAD), v_exp_f16_e32 for the low half, and a v_or_b32 merge.
+;  * GFX906, GFX908, GFX942 and GFX10 expect two in-place v_exp_f16_sdwa on v0
+;    with dst_unused UNUSED_PRESERVE, WORD_1 first and then WORD_0, and no
+;    merge instruction. The GFX942 checks also expect an s_nop 0 between them.
+;  * GFX11 expects no SDWA: v_lshrrev_b32, one v_exp_f16_e32 per half, and
+;    v_pack_b32_f16.
+define <2 x half> @exp2_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp2_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp2_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp2_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp2_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+; exp10 on <2 x half>: calls @llvm.exp10.v2f16 on %a and returns the result.
+; Every prefix expects each half to be widened to f32, multiplied by the f32
+; literal 0x3fb8aa3b, passed through v_exp_f32 and narrowed back to f16.
+;  * GFX8 narrows the high half with v_cvt_f16_f32_sdwa into WORD_1
+;    (UNUSED_PAD) and merges with v_or_b32.
+;  * GFX9 and GFX10 narrow both halves with e32 conversions and still combine
+;    them with v_pack_b32_f16; no UNUSED_PRESERVE form is expected here.
+;  * GFX11 splits with v_lshrrev_b32, expects the two multiplies fused into one
+;    v_dual_mul_f32, and recombines with v_pack_b32_f16.
+; NOTE(review): these expectations are instruction-for-instruction the same as
+; the ones for @exp_v2f16, including the 0x3fb8aa3b multiplier (about 1.4427,
+; log2 e); an exp10 expansion would normally be expected to scale by log2(10).
+; These lines presumably record what the compiler emits, so confirm the
+; backend lowering is intended rather than hand-editing the expectations.
+define <2 x half> @exp10_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp10_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.exp10.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+; sqrt on <2 x half>: calls @llvm.sqrt.v2f16 on %a and returns the result.
+;  * GFX8 expects v_sqrt_f16_sdwa for the high half into v1 (WORD_1,
+;    UNUSED_PAD), v_sqrt_f16_e32 for the low half, and a v_or_b32 merge.
+;  * GFX906, GFX908, GFX942 and GFX10 expect two in-place v_sqrt_f16_sdwa on v0
+;    with dst_unused UNUSED_PRESERVE, WORD_1 first and then WORD_0, and no
+;    merge instruction. The GFX942 checks also expect an s_nop 0 between them.
+;  * GFX11 expects no SDWA: v_lshrrev_b32, one v_sqrt_f16_e32 per half, and
+;    v_pack_b32_f16.
+define <2 x half> @sqrt_v2f16(<2 x half> %a) {
+; GFX8-LABEL: sqrt_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sqrt_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sqrt_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sqrt_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <4 x half> @sin_v4f16(<4 x half> %a) {
+; GFX8-LABEL: sin_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v4, v4
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_e32 v2, v2
+; GFX8-NEXT: v_sin_f16_e32 v4, v4
+; GFX8-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v4f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v4f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v4f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sin_f16_e32 v2, v2
+; GFX11-NEXT: v_sin_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @cos_v4f16(<4 x half> %a) {
+; GFX8-LABEL: cos_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v4, v4
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_cos_f16_e32 v2, v2
+; GFX8-NEXT: v_cos_f16_e32 v4, v4
+; GFX8-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v4f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v4f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v4f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cos_f16_e32 v1, v1
+; GFX11-NEXT: v_cos_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cos_f16_e32 v2, v2
+; GFX11-NEXT: v_cos_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @log_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v2, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v3, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x398c
+; GFX8-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v3, 0x398c, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v4f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x398c
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v4f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x398c
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v4f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x398c
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x398c, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @log2_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log2_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_log_f16_e32 v1, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @log10_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log10_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v2, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v3, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x34d1
+; GFX8-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v4f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x34d1
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v4f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x34d1
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v4f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x34d1
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.log10.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @exp_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
+; GFX8-NEXT: v_exp_f32_e32 v3, v3
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_exp_f32_e32 v3, v3
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v2
+; GFX10-NEXT: v_exp_f32_e32 v3, v3
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @exp2_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp2_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: v_exp_f16_e32 v1, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f16_e32 v2, v2
+; GFX11-NEXT: v_exp_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.exp2.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @exp10_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp10_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
+; GFX8-NEXT: v_exp_f32_e32 v3, v3
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_exp_f32_e32 v3, v3
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v2
+; GFX10-NEXT: v_exp_f32_e32 v3, v3
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.exp10.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @sqrt_v4f16(<4 x half> %a) {
+; GFX8-LABEL: sqrt_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX11-NEXT: v_sqrt_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <5 x half> @sin_v5f16(<5 x half> %a) {
+; GFX8-LABEL: sin_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x3118
+; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v5, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT: v_fract_f16_e32 v3, v3
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v5, v5
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_e32 v3, v3
+; GFX8-NEXT: v_sin_f16_e32 v5, v5
+; GFX8-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_sin_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v5f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX906-NEXT: v_sin_f16_e32 v2, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v5f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX908-NEXT: v_sin_f16_e32 v2, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v5f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX942-NEXT: v_sin_f16_e32 v2, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_e32 v2, v2
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: v_sin_f16_e32 v2, v2
+; GFX11-NEXT: v_sin_f16_e32 v3, v3
+; GFX11-NEXT: v_sin_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.sin.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @cos_v5f16(<5 x half> %a) {
+; GFX8-LABEL: cos_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x3118
+; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v5, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT: v_fract_f16_e32 v3, v3
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v5, v5
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_cos_f16_e32 v3, v3
+; GFX8-NEXT: v_cos_f16_e32 v5, v5
+; GFX8-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_cos_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v5f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX906-NEXT: v_cos_f16_e32 v2, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v5f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX908-NEXT: v_cos_f16_e32 v2, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v5f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX942-NEXT: v_cos_f16_e32 v2, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_e32 v2, v2
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT: v_cos_f16_e32 v1, v1
+; GFX11-NEXT: v_cos_f16_e32 v0, v0
+; GFX11-NEXT: v_cos_f16_e32 v2, v2
+; GFX11-NEXT: v_cos_f16_e32 v3, v3
+; GFX11-NEXT: v_cos_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.cos.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @log_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v3, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v4, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v2, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x398c
+; GFX8-NEXT: v_mul_f16_e32 v3, 0x398c, v3
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v4, 0x398c, v4
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v5f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x398c
+; GFX906-NEXT: v_log_f16_e32 v2, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v5f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x398c
+; GFX908-NEXT: v_log_f16_e32 v2, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v5f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x398c
+; GFX942-NEXT: v_log_f16_e32 v2, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX10-NEXT: v_log_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_log_f16_e32 v4, v4
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x398c, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0x398c, v4
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.log.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @log2_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log2_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_log_f16_e32 v1, v1
+; GFX8-NEXT: v_log_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_e32 v2, v2
+; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_e32 v2, v2
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_log_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @log10_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log10_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v3, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v4, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v2, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x34d1
+; GFX8-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v4, 0x34d1, v4
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v5f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x34d1
+; GFX906-NEXT: v_log_f16_e32 v2, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v5f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x34d1
+; GFX908-NEXT: v_log_f16_e32 v2, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v5f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x34d1
+; GFX942-NEXT: v_log_f16_e32 v2, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX10-NEXT: v_log_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_log_f16_e32 v4, v4
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0x34d1, v4
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.log10.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @exp_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT: v_exp_f32_e32 v3, v3
+; GFX8-NEXT: v_exp_f32_e32 v4, v4
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT: v_exp_f32_e32 v3, v3
+; GFX9-NEXT: v_exp_f32_e32 v4, v4
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT: v_exp_f32_e32 v3, v3
+; GFX10-NEXT: v_exp_f32_e32 v4, v4
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: v_exp_f32_e32 v4, v4
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.exp.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @exp2_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp2_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: v_exp_f16_e32 v1, v1
+; GFX8-NEXT: v_exp_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_exp_f16_e32 v2, v2
+; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_e32 v2, v2
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: v_exp_f16_e32 v2, v2
+; GFX11-NEXT: v_exp_f16_e32 v3, v3
+; GFX11-NEXT: v_exp_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.exp2.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @exp10_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp10_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT: v_exp_f32_e32 v3, v3
+; GFX8-NEXT: v_exp_f32_e32 v4, v4
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT: v_exp_f32_e32 v3, v3
+; GFX9-NEXT: v_exp_f32_e32 v4, v4
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT: v_exp_f32_e32 v3, v3
+; GFX10-NEXT: v_exp_f32_e32 v4, v4
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: v_exp_f32_e32 v4, v4
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.exp10.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @sqrt_v5f16(<5 x half> %a) {
+; GFX8-LABEL: sqrt_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX8-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX11-NEXT: v_sqrt_f16_e32 v3, v3
+; GFX11-NEXT: v_sqrt_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.sqrt.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <4 x half> @cascaded_v4f16(<4 x half> %a) {
+; GFX8-LABEL: cascaded_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v2, v1
+; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX8-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_fract_f16_e32 v3, v3
+; GFX8-NEXT: v_sin_f16_e32 v2, v2
+; GFX8-NEXT: v_sin_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cascaded_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_e32 v2, v1
+; GFX9-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT: v_sin_f16_e32 v2, v2
+; GFX9-NEXT: v_sin_f16_e32 v0, v0
+; GFX9-NEXT: v_sin_f16_e32 v3, v3
+; GFX9-NEXT: v_sin_f16_e32 v1, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cascaded_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v2, v1
+; GFX10-NEXT: v_log_f16_e32 v3, v0
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX10-NEXT: v_sin_f16_e32 v2, v2
+; GFX10-NEXT: v_sin_f16_e32 v3, v3
+; GFX10-NEXT: v_sin_f16_e32 v0, v0
+; GFX10-NEXT: v_sin_f16_e32 v1, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cascaded_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sin_f16_e32 v2, v2
+; GFX11-NEXT: v_sin_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+ %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
+ ret <4 x half> %res
+}
+
+define <5 x half> @cascaded_v5f16(<5 x half> %a) {
+; GFX8-LABEL: cascaded_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v4, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_log_f16_e32 v2, v2
+; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX8-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT: v_fract_f16_e32 v4, v4
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_fract_f16_e32 v3, v3
+; GFX8-NEXT: v_sin_f16_e32 v4, v4
+; GFX8-NEXT: v_sin_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_sin_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cascaded_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_e32 v4, v1
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: v_log_f16_e32 v2, v2
+; GFX9-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX9-NEXT: v_sin_f16_e32 v4, v4
+; GFX9-NEXT: v_sin_f16_e32 v0, v0
+; GFX9-NEXT: v_sin_f16_e32 v3, v3
+; GFX9-NEXT: v_sin_f16_e32 v1, v1
+; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX9-NEXT: v_sin_f16_e32 v2, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX9-NEXT: v_pack_b32_f16 v1, v4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cascaded_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v3, v1
+; GFX10-NEXT: v_log_f16_e32 v4, v0
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX10-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT: v_sin_f16_e32 v3, v3
+; GFX10-NEXT: v_sin_f16_e32 v4, v4
+; GFX10-NEXT: v_sin_f16_e32 v0, v0
+; GFX10-NEXT: v_sin_f16_e32 v1, v1
+; GFX10-NEXT: v_sin_f16_e32 v2, v2
+; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cascaded_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_log_f16_e32 v4, v4
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: v_sin_f16_e32 v2, v2
+; GFX11-NEXT: v_sin_f16_e32 v3, v3
+; GFX11-NEXT: v_sin_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %b = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
+ %res = call <5 x half> @llvm.sin.v5f16(<5 x half> %b)
+ ret <5 x half> %res
+}
>From 6f2c15c4e8dc4df989679c6dbf36e8a7386db07b Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 11:10:42 +0000
Subject: [PATCH 4/7] Refactored the code to encapsulate the redundant
code in a lambda function.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 57 ++++++++++++-----------
1 file changed, 31 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index b8bb27430707a..7d591bc801d3d 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1455,8 +1455,21 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
MachineInstr *Def1MI,
Register SrcRootReg,
const SIInstrInfo *TII) {
- // As if could, the Def1MI would have been sdwa-ed
- if (!TII->isSDWA(Def1MI->getOpcode()))
+ // If possible, Def1MI would already have been converted to SDWA to access
+ // the upper half; Def0MI must not be SDWA, as it accesses the lower half.
+ if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+ return false;
+
+ // Def1 should write the entire DWORD of dst, with the unused part
+ // zero-padded (UNUSED_PAD).
+ MachineOperand *Def1DstSel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+ if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+ return false;
+ MachineOperand *Def1DstUnused =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+ if (!Def1DstUnused ||
+ Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
return false;
MachineOperand *Def1Src0 =
@@ -1468,13 +1481,7 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
MachineOperand *Def0Src1 =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
- if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
- MachineOperand *Def1Src0Sel =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
- if (!Def1Src0Sel ||
- (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
- return false;
-
+ auto chkForDef0MIAccess = [&]() -> bool {
if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
MachineOperand *Def0Src0Sel =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
@@ -1492,6 +1499,19 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
return true;
}
+
+ return false;
+ };
+
+ if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+ MachineOperand *Def1Src0Sel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+ if (!Def1Src0Sel ||
+ (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ return false;
+
+ if (chkForDef0MIAccess())
+ return true;
}
if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
@@ -1501,23 +1521,8 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
(Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
return false;
- if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
- MachineOperand *Def0Src0Sel =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
- if (!Def0Src0Sel)
- return true;
- if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
- return true;
- }
-
- if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
- MachineOperand *Def0Src1Sel =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
- if (!Def0Src1Sel)
- return true;
- if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
- return true;
- }
+ if (chkForDef0MIAccess())
+ return true;
}
return false;
>From ea24f4ca53372dff93e93e6de84b3c15c2ad5edc Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 28 Apr 2025 09:33:53 +0000
Subject: [PATCH 5/7] Added reviewed changes addressing redundant code &
complex logic.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 450 ++++++++++------------
1 file changed, 214 insertions(+), 236 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 7d591bc801d3d..d00e8bb83343c 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -77,7 +77,7 @@ class SIPeepholeSDWA {
computeMIChainsForPackedOps(MachineInstr *ParentMI,
std::queue<MachineOperand *> &DefSrcQueue,
const GCNSubtarget &ST);
- void convertMIToSDWAWithOpsel(MachineInstr &MI, MachineOperand &SrcMO,
+ void convertMIToSDWAWithOpsel(MachineInstr *MI, MachineOperand &SrcMO,
AMDGPU::SDWA::SdwaSel OpSel);
public:
@@ -280,17 +280,13 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
#endif
-static void copyRegOperand(MachineOperand &To, const MachineOperand &From,
- bool isKill = false) {
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
assert(To.isReg() && From.isReg());
To.setReg(From.getReg());
To.setSubReg(From.getSubReg());
To.setIsUndef(From.isUndef());
if (To.isUse()) {
- if (isKill)
- To.setIsKill(true);
- else
- To.setIsKill(From.isKill());
+ To.setIsKill(From.isKill());
} else {
To.setIsDead(From.isDead());
}
@@ -1481,22 +1477,20 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
MachineOperand *Def0Src1 =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
- auto chkForDef0MIAccess = [&]() -> bool {
+ auto checkForDef0MIAccess = [&]() -> bool {
if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
MachineOperand *Def0Src0Sel =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
- if (!Def0Src0Sel)
- return true;
- if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ if (!Def0Src0Sel ||
+ Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
return true;
}
if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
MachineOperand *Def0Src1Sel =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
- if (!Def0Src1Sel)
- return true;
- if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ if (!Def0Src1Sel ||
+ Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
return true;
}
@@ -1506,22 +1500,20 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
MachineOperand *Def1Src0Sel =
TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
- if (!Def1Src0Sel ||
- (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ if (!Def1Src0Sel || Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
return false;
- if (chkForDef0MIAccess())
+ if (checkForDef0MIAccess())
return true;
}
if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
MachineOperand *Def1Src1Sel =
TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
- if (!Def1Src1Sel ||
- (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ if (!Def1Src1Sel || Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
return false;
- if (chkForDef0MIAccess())
+ if (checkForDef0MIAccess())
return true;
}
@@ -1546,71 +1538,69 @@ static bool dominates(MachineBasicBlock::const_iterator A,
// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
// and preserving the rest of Dst's bits.
-void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
MachineOperand &SrcMO,
AMDGPU::SDWA::SdwaSel OpSel) {
LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
- MachineInstr *SDWAInst;
- if (TII->isSDWA(MI.getOpcode())) {
- SDWAInst = &MI;
- } else {
- SDWAInst = createSDWAVersion(MI);
- MI.eraseFromParent();
+ if (!TII->isSDWA(MI->getOpcode())) {
+ MachineInstr *SDWAInst = createSDWAVersion(*MI);
+ MI->eraseFromParent();
+ MI = SDWAInst;
}
- ConvertedInstructions.push_back(SDWAInst);
- unsigned SDWAOpcode = SDWAInst->getOpcode();
+ ConvertedInstructions.push_back(MI);
+ unsigned SDWAOpcode = MI->getOpcode();
++NumSDWAInstructionsToEliminateFP16Pack;
- MachineOperand *Dst = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst);
+ MachineOperand *Dst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
- MachineOperand *DstSel =
- TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+ MachineOperand *DstSel = TII->getNamedOperand(*MI, AMDGPU::OpName::dst_sel);
assert(DstSel &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
DstSel->setImm(OpSel);
MachineOperand *DstUnused =
- TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
assert(DstUnused &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
"Dst_unused should not be UNUSED_PRESERVE already");
DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
- auto PreserveDstIdx =
+ int PreserveDstIdx =
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
assert(PreserveDstIdx != -1);
- auto NewSrcImplitMO = MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+ MachineOperand NewSrcImplitMO =
+ MachineOperand::CreateReg(SrcMO.getReg(), false, true);
copyRegOperand(NewSrcImplitMO, SrcMO);
- SDWAInst->addOperand(NewSrcImplitMO);
- SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+ MI->addOperand(NewSrcImplitMO);
+ MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
- MachineOperand *Src0 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0);
+ MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
MachineOperand *Src0Sel =
- TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0_sel);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src0_sel);
assert(Src0Sel &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
Src0Sel->setImm(OpSel);
- LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+ LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
return;
}
- MachineOperand *Src1 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1);
+ MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
MachineOperand *Src1Sel =
- TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1_sel);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src1_sel);
assert(Src1Sel &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
Src1Sel->setImm(OpSel);
- LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+ LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
return;
}
}
@@ -1621,15 +1611,20 @@ unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
const GCNSubtarget &ST) {
unsigned NumOfFP16Def;
+
+ // Walk up the use-def chain starting at ParentMI until the exit condition
+ // is hit: either none of the use operands has a def that is convertible
+ // to SDWA, or more than one does, in which case we cannot tell which
+ // path to follow.
do {
- MachineInstr *NextMIInChain = nullptr;
NumOfFP16Def = 0;
- for (MachineOperand &currentMO : ParentMI->uses()) {
- if (!currentMO.isReg() || currentMO.getReg().isPhysical() ||
- !MRI->hasOneUse(currentMO.getReg()))
+ MachineInstr *NextMIInChain = nullptr;
+ for (MachineOperand &CurrentMO : ParentMI->uses()) {
+ if (!CurrentMO.isReg() || CurrentMO.getReg().isPhysical() ||
+ !MRI->hasOneUse(CurrentMO.getReg()))
continue;
- MachineOperand *DefCurrMO = findSingleRegDef(&currentMO, MRI);
+ MachineOperand *DefCurrMO = findSingleRegDef(&CurrentMO, MRI);
if (!DefCurrMO)
continue;
@@ -1643,11 +1638,8 @@ unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
NumOfFP16Def++;
}
- if (NumOfFP16Def > 1)
- break;
-
ParentMI = NextMIInChain;
- } while (ParentMI);
+ } while (NumOfFP16Def == 1);
return NumOfFP16Def;
}
@@ -1658,216 +1650,202 @@ void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
return;
for (MachineInstr &MI : make_early_inc_range(MBB)) {
- if (MI.getOpcode() == AMDGPU::V_PACK_B32_F16_e64) {
- LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
- std::queue<MachineOperand *> DefSrc0Queue;
- std::queue<MachineOperand *> DefSrc1Queue;
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-
- if (!Src0->isReg() || Src0->getReg().isPhysical() ||
- !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
- Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
- continue;
+ if (MI.getOpcode() != AMDGPU::V_PACK_B32_F16_e64)
+ continue;
+ LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
- MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
- MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+ std::queue<MachineOperand *> DefSrc0Queue;
+ std::queue<MachineOperand *> DefSrc1Queue;
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (!Op0 || !Op1)
- continue;
+ if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+ !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+ Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+ continue;
- MachineInstr *ParentMIOp0 = Op0->getParent();
- MachineInstr *ParentMIOp1 = Op1->getParent();
+ MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+ MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
- if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
- !isSrcDestFP16Bits(ParentMIOp1, TII) ||
- !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
- !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
- continue;
+ if (!Op0 || !Op1)
+ continue;
- DefSrc0Queue.push(Op0);
- DefSrc1Queue.push(Op1);
+ MachineInstr *ParentMIOp0 = Op0->getParent();
+ MachineInstr *ParentMIOp1 = Op1->getParent();
- // This checks for the given MI, that it only has exact one register MO
- // use , that is defined by pure FP16 instruction (that is SDWA-able too)
- unsigned NumOfFP16Def;
+ if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+ !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+ continue;
- NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
- if (NumOfFP16Def > 1)
- continue;
+ DefSrc0Queue.push(Op0);
+ DefSrc1Queue.push(Op1);
- NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
- if (NumOfFP16Def > 1)
- continue;
+ // Check that the given MI has exactly one register use MO that is
+ // defined by a pure FP16 instruction (which must be SDWA-able too).
+ unsigned NumOfFP16Def;
- MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
- MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
- Register SrcRootMOReg = AMDGPU::NoRegister;
+ NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
+ if (NumOfFP16Def > 1)
+ continue;
- // Now, check if the last operation for each in of the DefSrcQueue
- // has the common MO, that would be the source root MO for element-wise
- // fp16 chain operations
- for (MachineOperand &Current0MO : Def0RootMI->uses()) {
- if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
- continue;
+ NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
+ if (NumOfFP16Def > 1)
+ continue;
- for (MachineOperand &Current1MO : Def1RootMI->uses()) {
- if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
- continue;
+ MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+ MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+ Register SrcRootMOReg = AMDGPU::NoRegister;
- if (Current0MO.getReg() == Current1MO.getReg() &&
- Current0MO.getSubReg() == Current1MO.getSubReg()) {
- SrcRootMOReg = Current0MO.getReg();
- break;
- }
- }
- // Found it, no more check needed, so break;
- if (SrcRootMOReg != AMDGPU::NoRegister)
+ // Now check whether the last operation in each of the DefSrcQueues
+ // shares a common MO; that would be the source root MO for the
+ // element-wise fp16 chain operations.
+ for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+ if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+ continue;
+
+ for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+ if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+ continue;
+
+ if (Current0MO.getReg() == Current1MO.getReg() &&
+ Current0MO.getSubReg() == Current1MO.getSubReg()) {
+ SrcRootMOReg = Current0MO.getReg();
break;
+ }
}
+ // Found it, no more check needed, so break;
+ if (SrcRootMOReg != AMDGPU::NoRegister)
+ break;
+ }
- if (SrcRootMOReg == AMDGPU::NoRegister)
- continue;
+ if (SrcRootMOReg == AMDGPU::NoRegister)
+ continue;
- // Also we need to ensure that each of the DefXRootMI should access the
- // lower and upper half word of SrcRootMOReg respectively.
- if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg,
- TII))
- continue;
+ // Also we need to ensure that each of the DefXRootMI should access the
+ // lower and upper half word of SrcRootMOReg respectively.
+ if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg, TII))
+ continue;
- // The graph below represents the connection :
- // Op0Intial --> Op0x --> ... --> Op0Final
- // / \'
- // SrcRootMO v_Pack_b32_f16
- // \ /
- // Op1Intial --> Op1x --> ... --> Op1Final
- // The nomenclature is based upon above flow-graph
- //
- // Also for each of DefSrcXQueue :
- // OpXIntial is at back & OpXFinal is at front
- auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
- auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
- auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
- auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
-
- MachineOperand *FinalOutMO = nullptr;
- std::queue<MachineOperand *> ChainedDefOps;
- AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
- int NumOfElemInSecondOpChain = 0;
-
- // Now, we will change the flow as per the dominace of MI as follows, if
- // possible and store it in ChainedDefOps, so later can be used to convert
- // into its SDWA version:
- //
- // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
- // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
- // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
- //
- // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
- // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
- // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
- //
- // TODO : Else, not handled!
- // One such case is observed when multiple fp16 instruction are chained
- // on a fp16 vector input. For Example :
- //
- // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
- // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
- // return <2 x half> %res
- if (dominates(Op0FinalMI, Op1IntialMI)) {
- int OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &MOTo = Op1IntialMI->getOperand(OpIdx);
- auto MOFrom = DefSrc0Queue.front();
- copyRegOperand(MOTo, *MOFrom, true);
- FinalOutMO = DefSrc1Queue.front();
-
- LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op1IntialMI
- << '\n');
- OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &IntialInMO = Op0IntialMI->getOperand(OpIdx);
-
- while (!DefSrc1Queue.empty()) {
- ChainedDefOps.push(DefSrc1Queue.front());
- DefSrc1Queue.pop();
- NumOfElemInSecondOpChain++;
- }
- while (!DefSrc0Queue.empty()) {
- ChainedDefOps.push(DefSrc0Queue.front());
- DefSrc0Queue.pop();
- }
+ // The graph below represents the connection :
+ // Op0Intial --> Op0x --> ... --> Op0Final
+ // / \'
+ // SrcRootMO v_Pack_b32_f16
+ // \ /
+ // Op1Intial --> Op1x --> ... --> Op1Final
+ // The nomenclature is based upon above flow-graph
+ //
+ // Also for each of DefSrcXQueue :
+ // OpXIntial is at back & OpXFinal is at front
+ auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
+ auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
+ auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
+ auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
+
+ MachineOperand *FinalOutMO = nullptr;
+ std::queue<MachineOperand *> ChainedDefOps;
+ AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+ int NumOfElemInSecondOpChain = 0;
+
+ auto canonicalizedMIFlow =
+ [&](std::queue<MachineOperand *> DefFromQueue,
+ std::queue<MachineOperand *> DefToQueue) -> void {
+ MachineInstr *OpToIntialMI = (DefToQueue.back())->getParent();
+ int OpIdx = OpToIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &MOTo = OpToIntialMI->getOperand(OpIdx);
+ auto MOFrom = DefFromQueue.front();
+ copyRegOperand(MOTo, *MOFrom);
+
+ LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *OpToIntialMI << '\n');
+
+ FinalOutMO = DefToQueue.front();
+ MachineInstr *OpFromIntialMI = (DefFromQueue.back())->getParent();
+ OpIdx = OpFromIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &IntialInMO = OpFromIntialMI->getOperand(OpIdx);
+
+ while (!DefToQueue.empty()) {
+ ChainedDefOps.push(DefToQueue.front());
+ DefToQueue.pop();
+ NumOfElemInSecondOpChain++;
+ }
+ while (!DefFromQueue.empty()) {
+ ChainedDefOps.push(DefFromQueue.front());
+ DefFromQueue.pop();
+ }
+ ChainedDefOps.push(&IntialInMO);
+ };
- ChainedDefOps.push(&IntialInMO);
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
- } else if (dominates(Op1FinalMI, Op0IntialMI)) {
- int OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &MOTo = Op0IntialMI->getOperand(OpIdx);
- auto MOFrom = DefSrc1Queue.front();
- copyRegOperand(MOTo, *MOFrom, true);
- FinalOutMO = DefSrc0Queue.front();
-
- LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op0IntialMI
- << '\n');
- OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &IntialInMO = Op1IntialMI->getOperand(OpIdx);
-
- while (!DefSrc0Queue.empty()) {
- ChainedDefOps.push(DefSrc0Queue.front());
- DefSrc0Queue.pop();
- NumOfElemInSecondOpChain++;
- }
- while (!DefSrc1Queue.empty()) {
- ChainedDefOps.push(DefSrc1Queue.front());
- DefSrc1Queue.pop();
- }
+ // Now, we will change the flow as per the dominance of MI as follows, if
+ // possible and store it in ChainedDefOps, so later can be used to convert
+ // into its SDWA version:
+ //
+ // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
+ // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
+ // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
+ //
+ // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
+ // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
+ // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
+ //
+ // TODO : Else, not handled!
+ // One such case is observed when multiple fp16 instructions are chained
+ // on a fp16 vector input. For Example :
+ //
+ // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+ // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+ // return <2 x half> %res
+ if (dominates(Op0FinalMI, Op1IntialMI)) {
+ canonicalizedMIFlow(DefSrc0Queue, DefSrc1Queue);
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ } else if (dominates(Op1FinalMI, Op0IntialMI)) {
+ canonicalizedMIFlow(DefSrc1Queue, DefSrc0Queue);
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+ } else {
+ LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+ continue;
+ }
- ChainedDefOps.push(&IntialInMO);
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
- } else {
- LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+ // Replace all use places of MI(v_pack) defMO with FinalOutMO.
+ MachineOperand &DefMO = MI.getOperand(0);
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+ if (!MO.isReg())
continue;
- }
- // Replace all use places of MI(v_pack) defMO with FinalOutMO.
- MachineOperand &DefMO = MI.getOperand(0);
- for (MachineOperand &MO :
- make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
- if (!MO.isReg())
- continue;
+ MO.setReg(FinalOutMO->getReg());
+ MO.setSubReg(FinalOutMO->getSubReg());
+ }
+ LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI << "With "
+ << *FinalOutMO << '\n');
- MO.setReg(FinalOutMO->getReg());
- MO.setSubReg(FinalOutMO->getSubReg());
+ // Delete v_pack machine instruction
+ LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+ MI.eraseFromParent();
+ ++Num16BitPackedInstructionsEliminated;
+
+ // Convert machine instruction into SDWA-version
+ while (ChainedDefOps.size() != 1) {
+ if (NumOfElemInSecondOpChain == 0) {
+ if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ else
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
}
- LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI
- << "With " << *FinalOutMO << '\n');
-
- // Delete v_pack machine instruction
- LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
- MI.eraseFromParent();
- ++Num16BitPackedInstructionsEliminated;
-
- // Convert machine instruction into SDWA-version
- while (ChainedDefOps.size() != 1) {
- if (NumOfElemInSecondOpChain == 0) {
- if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
- else
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
- }
-
- MachineInstr *DefMI = ChainedDefOps.front()->getParent();
- ChainedDefOps.pop();
- MachineOperand *SrcMO = ChainedDefOps.front();
- // Take SrcMO (which are def) as its usage in DefMI
- if (SrcMO->isDef()) {
- assert(MRI->hasOneUse(SrcMO->getReg()));
- SrcMO = findSingleRegUse(SrcMO, MRI);
- assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
- }
+ MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+ ChainedDefOps.pop();
+ MachineOperand *SrcMO = ChainedDefOps.front();
- convertMIToSDWAWithOpsel(*DefMI, *SrcMO, OpSel);
- NumOfElemInSecondOpChain--;
+ // Take SrcMO (which are def) as its usage in DefMI
+ if (SrcMO->isDef()) {
+ assert(MRI->hasOneUse(SrcMO->getReg()));
+ SrcMO = findSingleRegUse(SrcMO, MRI);
+ assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
}
+
+ convertMIToSDWAWithOpsel(DefMI, *SrcMO, OpSel);
+ NumOfElemInSecondOpChain--;
}
}
}
>From 06a65430e56f54d5c586e46be9b639040dd86fc6 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 28 Apr 2025 10:41:00 +0000
Subject: [PATCH 6/7] Added MIR test to demonstrate the specific MIR pattern
handling to eliminate packing for fp16.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 11 +-
llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir | 337 +++++++++++++++++++
2 files changed, 344 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index d00e8bb83343c..622dfe77cda78 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1529,11 +1529,14 @@ static bool dominates(MachineBasicBlock::const_iterator A,
if (B == MBBEnd)
return true;
- MachineBasicBlock::const_iterator I = MBB->begin();
- for (; &*I != A && &*I != B; ++I)
- ;
+ if (A == MBBEnd)
+ return false;
+
+ MachineBasicBlock::const_iterator I = A;
+ while (I != B && I != MBBEnd)
+ I++;
- return &*I == A;
+ return (I == B);
}
// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
new file mode 100644
index 0000000000000..2b2dce0d26a09
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
@@ -0,0 +1,337 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass si-peephole-sdwa,dead-mi-elimination -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+--- |
+ source_filename = "/home/vikashgu/work/upstream/llvm-project/llvm/test/CodeGen/AMDGPU/vector-fp16.ll"
+ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+ target triple = "amdgcn-amd-amdhsa"
+
+ define <4 x half> @sin_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @cos_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @log_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @log2_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @exp_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @cascaded_v4f16(<4 x half> %a) #0 {
+ %b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+ %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
+ ret <4 x half> %res
+ }
+
+ declare <4 x half> @llvm.cos.v4f16(<4 x half>) #1
+
+ declare <4 x half> @llvm.exp.v4f16(<4 x half>) #1
+
+ declare <4 x half> @llvm.log.v4f16(<4 x half>) #1
+
+ declare <4 x half> @llvm.log2.v4f16(<4 x half>) #1
+
+ declare <4 x half> @llvm.sin.v4f16(<4 x half>) #1
+
+ attributes #0 = { "target-cpu"="gfx942" }
+ attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx942" }
+...
+
+
+---
+name: sin_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: sin_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[V_SIN_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_SIN_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_SIN_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_SIN_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_SIN_F16_sdwa3]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_SIN_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %13:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %26
+ $vgpr1 = COPY %27
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: cos_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: cos_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[V_COS_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_COS_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_COS_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_COS_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_COS_F16_sdwa3]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_COS_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %13:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %26
+ $vgpr1 = COPY %27
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: log_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: log_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa2]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa3]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_MUL_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %12:sreg_32 = S_MOV_B32 14732
+ %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %17, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %26
+ $vgpr1 = COPY %27
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: log2_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: log2_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_LOG_F16_sdwa3]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa2]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
+ %14:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %16, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %18, 0, killed %13, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %21
+ $vgpr1 = COPY %22
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: exp_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: exp_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811
+ ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_2]], 0, killed [[V_CVT_F16_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_CVT_F16_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %12:sgpr_32 = S_MOV_B32 1069066811
+ %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %14:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %13, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %14, 0, 0, implicit $mode, implicit $exec
+ %17:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %19:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %17, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %19, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %21, 0, 0, implicit $mode, implicit $exec
+ %24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
+ %28:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %30:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %28, 0, 0, implicit $mode, implicit $exec
+ %31:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %30, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %32:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec
+ %33:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %32, 0, 0, implicit $mode, implicit $exec
+ %34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %33, 0, 0, implicit $mode, implicit $exec
+ %35:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %15, 0, killed %22, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %34
+ $vgpr1 = COPY %35
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: cascaded_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: cascaded_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_LOG_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_LOG_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_SIN_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_SIN_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_SIN_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_2]], 0, killed [[V_SIN_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_]], 0, killed [[V_SIN_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
+ %14:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %23:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %22, 0, 0, implicit $mode, implicit $exec
+ %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %16, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %18, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
+ %28:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %13, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %29:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %28, 0, 0, implicit $mode, implicit $exec
+ %30:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %29, 0, 0, implicit $mode, implicit $exec
+ %31:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %23, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %30
+ $vgpr1 = COPY %31
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
>From 64c89cfbb249f6ad5bf6328cddf27d6c60b4dea3 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 6 May 2025 07:49:38 +0000
Subject: [PATCH 7/7] Reduce duplicated code and add a new MIR test to the
existing test file.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 106 ++++++++-----------
llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir | 43 ++++++++
2 files changed, 89 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 622dfe77cda78..15e3de04237eb 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1468,54 +1468,44 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
return false;
- MachineOperand *Def1Src0 =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
- MachineOperand *Def1Src1 =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
- MachineOperand *Def0Src0 =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
- MachineOperand *Def0Src1 =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
-
- auto checkForDef0MIAccess = [&]() -> bool {
- if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
- MachineOperand *Def0Src0Sel =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
- if (!Def0Src0Sel ||
- Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
- return true;
- }
-
- if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
- MachineOperand *Def0Src1Sel =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
- if (!Def0Src1Sel ||
- Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
- return true;
+ const auto checkSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+ AMDGPU::OpName SrcSelName,
+ AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
+ MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
+ if (DefSrc && DefSrc->isReg() && (DefSrc->getReg() == SrcRootReg)) {
+ MachineOperand *DefSrcSel = TII->getNamedOperand(*DefMI, SrcSelName);
+ if (SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_0) {
+ if (!DefSrcSel || DefSrcSel->getImm() == SdwaSel)
+ return true;
+ } else {
+ assert(SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+ "Not valid SDWA SrcSel operand");
+ if (DefSrcSel && DefSrcSel->getImm() == SdwaSel)
+ return true;
+ }
}
-
return false;
};
- if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
- MachineOperand *Def1Src0Sel =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
- if (!Def1Src0Sel || Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
- return false;
+ const auto checkForDef0MIAccess = [&]() -> bool {
+ if (checkSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0))
+ return true;
+ if (checkSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0))
+ return true;
+ return false;
+ };
+ if (checkSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1))
if (checkForDef0MIAccess())
return true;
- }
-
- if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
- MachineOperand *Def1Src1Sel =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
- if (!Def1Src1Sel || Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
- return false;
+ if (checkSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1))
if (checkForDef0MIAccess())
return true;
- }
return false;
}
@@ -1568,7 +1558,7 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
assert(DstUnused &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
- assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+ assert(DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE &&
"Dst_unused should not be UNUSED_PRESERVE already");
DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
@@ -1581,31 +1571,27 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
MI->addOperand(NewSrcImplitMO);
MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
- MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
- assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
- if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
- MachineOperand *Src0Sel =
- TII->getNamedOperand(*MI, AMDGPU::OpName::src0_sel);
- assert(Src0Sel &&
- AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
- Src0Sel->setImm(OpSel);
+ auto modifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
+ AMDGPU::OpName SrcSelName) -> bool {
+ MachineOperand *Src = TII->getNamedOperand(*MI, SrcName);
+ assert(Src && AMDGPU::hasNamedOperand(SDWAOpcode, SrcName));
+ if (Src->isReg() && (Src->getReg() == SrcMO.getReg())) {
+ MachineOperand *SrcSel = TII->getNamedOperand(*MI, SrcSelName);
+ assert(SrcSel && AMDGPU::hasNamedOperand(SDWAOpcode, SrcSelName));
+ SrcSel->setImm(OpSel);
- LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
- return;
- }
+ LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+ return true;
+ }
- MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
- assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
- if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
- MachineOperand *Src1Sel =
- TII->getNamedOperand(*MI, AMDGPU::OpName::src1_sel);
- assert(Src1Sel &&
- AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
- Src1Sel->setImm(OpSel);
+ return false;
+ };
- LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+ if (modifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
+ return;
+
+ if (modifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
return;
- }
}
// BackTracks the given Parent MI to look for any of its use operand that has
diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
index 2b2dce0d26a09..9318f8dd2bbea 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
@@ -37,6 +37,10 @@
ret <4 x half> %res
}
+ define void @unbalanced_operations_packed(<4 x half> %a) #0 {
+ ret void
+ }
+
declare <4 x half> @llvm.cos.v4f16(<4 x half>) #1
declare <4 x half> @llvm.exp.v4f16(<4 x half>) #1
@@ -335,3 +339,42 @@ body: |
$vgpr1 = COPY %31
SI_RETURN implicit $vgpr0, implicit $vgpr1
...
+
+---
+name: unbalanced_operations_packed
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+ ; GFX9-LABEL: name: unbalanced_operations_packed
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa1]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %12:sreg_32 = S_MOV_B32 14732
+ %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %17, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %26
+ $vgpr1 = COPY %27
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
More information about the llvm-commits
mailing list