[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)
Vikash Gupta via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 02:42:46 PST 2025
https://github.com/vg0204 updated https://github.com/llvm/llvm-project/pull/137137
>From 14d7d1ebf1a23a8084c851e7b749977a65000c8f Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Wed, 23 Apr 2025 12:13:49 +0000
Subject: [PATCH 01/11] [AMDGPU][PeepholeOpt] Eliminate unnecessary packing in
fp16 vector operations for SDWA/OPSEL-able instruction
As the compiler has no packed fp16 instruction, isel scalarizes each
fp16 operation on wide fp16 vectors and generates separate individual
fp16 results, which are later packed. Afterwards, in the post-isel
SIPeepholeSDWA pass, instructions are opportunistically converted into
their SDWA/OPSEL-able versions.
This patch gets rid of the unnecessary packing in wider fp16 vector
operations for SDWA/OPSEL-able instructions, by writing each partial
fp16 result into part of the same input register while preserving the
rest of the bits in that register, using the OPSEL dst_unused operand
set to UNUSED_PRESERVE. Because it operates in the context of SDWA
instruction generation, it is invoked at the end of the SIPeepholeSDWA pass.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 516 +++++++++++++++++++++-
1 file changed, 514 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index acc4b3f0a68b4..b5b743e029e1b 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -27,6 +27,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>
+#include <queue>
using namespace llvm;
@@ -35,6 +36,11 @@ using namespace llvm;
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
"Number of instruction converted to SDWA.");
+STATISTIC(Num16BitPackedInstructionsEliminated,
+ "Number of packed instruction eliminated.");
+STATISTIC(NumSDWAInstructionsToEliminateFP16Pack,
+ "Number of instruction converted/modified into SDWA to eliminate "
+ "FP16 packing.");
namespace {
@@ -67,6 +73,14 @@ class SIPeepholeSDWA {
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
+ void eliminateFP16Packing(MachineBasicBlock &MBB, const GCNSubtarget &ST);
+ unsigned
+ computeMIChainsForPackedOps(MachineInstr *ParentMI,
+ std::queue<MachineOperand *> &DefSrcQueue,
+ const GCNSubtarget &ST);
+ void convertMIToSDWAWithOpsel(MachineInstr &MI, MachineOperand &SrcMO,
+ AMDGPU::SDWA::SdwaSel OpSel);
+
public:
bool run(MachineFunction &MF);
};
@@ -267,13 +281,17 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
#endif
-static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From,
+ bool isKill = false) {
assert(To.isReg() && From.isReg());
To.setReg(From.getReg());
To.setSubReg(From.getSubReg());
To.setIsUndef(From.isUndef());
if (To.isUse()) {
- To.setIsKill(From.isKill());
+ if (isKill)
+ To.setIsKill(true);
+ else
+ To.setIsKill(From.isKill());
} else {
To.setIsDead(From.isDead());
}
@@ -1369,6 +1387,494 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
return SIPeepholeSDWA().run(MF);
}
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+ unsigned Opcode = MI->getOpcode();
+ if (TII->isSDWA(Opcode))
+ Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+ switch (Opcode) {
+ case AMDGPU::V_CVT_F16_U16_e32:
+ case AMDGPU::V_CVT_F16_U16_e64:
+ case AMDGPU::V_CVT_F16_I16_e32:
+ case AMDGPU::V_CVT_F16_I16_e64:
+ case AMDGPU::V_RCP_F16_e64:
+ case AMDGPU::V_RCP_F16_e32:
+ case AMDGPU::V_RSQ_F16_e64:
+ case AMDGPU::V_RSQ_F16_e32:
+ case AMDGPU::V_SQRT_F16_e64:
+ case AMDGPU::V_SQRT_F16_e32:
+ case AMDGPU::V_LOG_F16_e64:
+ case AMDGPU::V_LOG_F16_e32:
+ case AMDGPU::V_EXP_F16_e64:
+ case AMDGPU::V_EXP_F16_e32:
+ case AMDGPU::V_SIN_F16_e64:
+ case AMDGPU::V_SIN_F16_e32:
+ case AMDGPU::V_COS_F16_e64:
+ case AMDGPU::V_COS_F16_e32:
+ case AMDGPU::V_FLOOR_F16_e64:
+ case AMDGPU::V_FLOOR_F16_e32:
+ case AMDGPU::V_CEIL_F16_e64:
+ case AMDGPU::V_CEIL_F16_e32:
+ case AMDGPU::V_TRUNC_F16_e64:
+ case AMDGPU::V_TRUNC_F16_e32:
+ case AMDGPU::V_RNDNE_F16_e64:
+ case AMDGPU::V_RNDNE_F16_e32:
+ case AMDGPU::V_FRACT_F16_e64:
+ case AMDGPU::V_FRACT_F16_e32:
+ case AMDGPU::V_FREXP_MANT_F16_e64:
+ case AMDGPU::V_FREXP_MANT_F16_e32:
+ case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+ case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+ case AMDGPU::V_LDEXP_F16_e64:
+ case AMDGPU::V_LDEXP_F16_e32:
+ case AMDGPU::V_ADD_F16_e64:
+ case AMDGPU::V_ADD_F16_e32:
+ case AMDGPU::V_SUB_F16_e64:
+ case AMDGPU::V_SUB_F16_e32:
+ case AMDGPU::V_SUBREV_F16_e64:
+ case AMDGPU::V_SUBREV_F16_e32:
+ case AMDGPU::V_MUL_F16_e64:
+ case AMDGPU::V_MUL_F16_e32:
+ case AMDGPU::V_MAX_F16_e64:
+ case AMDGPU::V_MAX_F16_e32:
+ case AMDGPU::V_MIN_F16_e64:
+ case AMDGPU::V_MIN_F16_e32:
+ case AMDGPU::V_MAD_F16_e64:
+ case AMDGPU::V_FMA_F16_e64:
+ case AMDGPU::V_DIV_FIXUP_F16_e64:
+ return true;
+ case AMDGPU::V_MADAK_F16:
+ case AMDGPU::V_MADMK_F16:
+ case AMDGPU::V_FMAMK_F16:
+ case AMDGPU::V_FMAAK_F16:
+ // NOTE: not sure these can be handled safely, so reject them for now.
+ return false;
+ case AMDGPU::V_FMAC_F16_e32:
+ case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_MAC_F16_e32:
+ case AMDGPU::V_MAC_F16_e64:
+ // Rejected: their SDWA versions only allow dst_sel to be set to DWORD.
+ default:
+ return false;
+ }
+}
+
+static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
+ MachineInstr *Def1MI,
+ Register SrcRootReg,
+ const SIInstrInfo *TII) {
+ // Require SDWA form: if Def1MI could have been sdwa-ed, it already would be.
+ if (!TII->isSDWA(Def1MI->getOpcode()))
+ return false;
+
+ MachineOperand *Def1Src0 =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
+ MachineOperand *Def1Src1 =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
+ MachineOperand *Def0Src0 =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
+ MachineOperand *Def0Src1 =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
+
+ if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+ MachineOperand *Def1Src0Sel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+ if (!Def1Src0Sel ||
+ (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ return false;
+
+ if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+ MachineOperand *Def0Src0Sel =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+ if (!Def0Src0Sel)
+ return true;
+ if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ return true;
+ }
+
+ if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+ MachineOperand *Def0Src1Sel =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+ if (!Def0Src1Sel)
+ return true;
+ if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ return true;
+ }
+ }
+
+ if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
+ MachineOperand *Def1Src1Sel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
+ if (!Def1Src1Sel ||
+ (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ return false;
+
+ if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
+ MachineOperand *Def0Src0Sel =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
+ if (!Def0Src0Sel)
+ return true;
+ if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ return true;
+ }
+
+ if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
+ MachineOperand *Def0Src1Sel =
+ TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
+ if (!Def0Src1Sel)
+ return true;
+ if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// For A and B in the same MBB, returns true if A comes before B or B is end().
+static bool dominates(MachineBasicBlock::const_iterator A,
+ MachineBasicBlock::const_iterator B) {
+ assert(A->getParent() == B->getParent());
+ const MachineBasicBlock *MBB = A->getParent();
+ auto MBBEnd = MBB->end();
+ if (B == MBBEnd)
+ return true;
+
+ MachineBasicBlock::const_iterator I = MBB->begin();
+ for (; &*I != A && &*I != B; ++I)
+ ;
+
+ return &*I == A;
+}
+
+// Convert MI into its SDWA version (if it is not one already), set dst_sel and
+// SrcMO's sel to OpSel, and tie SrcMO to vdst with dst_unused:UNUSED_PRESERVE.
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+ MachineOperand &SrcMO,
+ AMDGPU::SDWA::SdwaSel OpSel) {
+ LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+ MachineInstr *SDWAInst;
+ if (TII->isSDWA(MI.getOpcode())) {
+ SDWAInst = &MI;
+ } else {
+ SDWAInst = createSDWAVersion(MI);
+ MI.eraseFromParent();
+ }
+
+ ConvertedInstructions.push_back(SDWAInst);
+ unsigned SDWAOpcode = SDWAInst->getOpcode();
+ ++NumSDWAInstructionsToEliminateFP16Pack;
+
+ MachineOperand *Dst = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst);
+ assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
+
+ MachineOperand *DstSel =
+ TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+ assert(DstSel &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
+ DstSel->setImm(OpSel);
+
+ MachineOperand *DstUnused =
+ TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+ assert(DstUnused &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
+ assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+ "Dst_unused should not be UNUSED_PRESERVE already");
+ DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
+
+ auto PreserveDstIdx =
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+ assert(PreserveDstIdx != -1);
+ auto NewSrcImplitMO = MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+ copyRegOperand(NewSrcImplitMO, SrcMO);
+ SDWAInst->addOperand(NewSrcImplitMO);
+ SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+
+ MachineOperand *Src0 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0);
+ assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
+ if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
+ MachineOperand *Src0Sel =
+ TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0_sel);
+ assert(Src0Sel &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
+ Src0Sel->setImm(OpSel);
+
+ LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+ return;
+ }
+
+ MachineOperand *Src1 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1);
+ assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
+ if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
+ MachineOperand *Src1Sel =
+ TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1_sel);
+ assert(Src1Sel &&
+ AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
+ Src1Sel->setImm(OpSel);
+
+ LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+ return;
+ }
+}
+
+// Walks back from ParentMI, queueing defs of single-use register operands that
+// come from SDWA-convertible FP16 MIs, until a MI has none or more than one.
+unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
+ MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
+ const GCNSubtarget &ST) {
+ unsigned NumOfFP16Def;
+ do {
+ MachineInstr *NextMIInChain = nullptr;
+ NumOfFP16Def = 0;
+ for (MachineOperand &currentMO : ParentMI->uses()) {
+ if (!currentMO.isReg() || currentMO.getReg().isPhysical() ||
+ !MRI->hasOneUse(currentMO.getReg()))
+ continue;
+
+ MachineOperand *DefCurrMO = findSingleRegDef(&currentMO, MRI);
+ if (!DefCurrMO)
+ continue;
+
+ MachineInstr *DefCurrMI = DefCurrMO->getParent();
+ if (!isSrcDestFP16Bits(DefCurrMI, TII) ||
+ !isConvertibleToSDWA(*DefCurrMI, ST, TII))
+ continue;
+
+ NextMIInChain = DefCurrMI;
+ DefSrcQueue.push(DefCurrMO);
+ NumOfFP16Def++;
+ }
+
+ if (NumOfFP16Def > 1)
+ break;
+
+ ParentMI = NextMIInChain;
+ } while (ParentMI);
+
+ return NumOfFP16Def;
+}
+
+void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
+ const GCNSubtarget &ST) {
+ if (!ST.has16BitInsts())
+ return;
+
+ for (MachineInstr &MI : make_early_inc_range(MBB)) {
+ if (MI.getOpcode() == AMDGPU::V_PACK_B32_F16_e64) {
+ LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
+ std::queue<MachineOperand *> DefSrc0Queue;
+ std::queue<MachineOperand *> DefSrc1Queue;
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+ if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+ !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+ Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+ continue;
+
+ MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+ MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+
+ if (!Op0 || !Op1)
+ continue;
+
+ MachineInstr *ParentMIOp0 = Op0->getParent();
+ MachineInstr *ParentMIOp1 = Op1->getParent();
+
+ if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+ !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+ continue;
+
+ DefSrc0Queue.push(Op0);
+ DefSrc1Queue.push(Op1);
+
+ // Walk each operand's def chain; bail out if some MI in it has more than
+ // one register use defined by a pure FP16 (and SDWA-able) instruction.
+ unsigned NumOfFP16Def;
+
+ NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
+ if (NumOfFP16Def > 1)
+ continue;
+
+ NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
+ if (NumOfFP16Def > 1)
+ continue;
+
+ MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+ MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+ Register SrcRootMOReg = AMDGPU::NoRegister;
+
+ // Now check whether the last queued operation of each DefSrcQueue shares
+ // a common register operand; that would be the source root MO for the
+ // element-wise fp16 chain operations.
+ for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+ if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+ continue;
+
+ for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+ if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+ continue;
+
+ if (Current0MO.getReg() == Current1MO.getReg() &&
+ Current0MO.getSubReg() == Current1MO.getSubReg()) {
+ SrcRootMOReg = Current0MO.getReg();
+ break;
+ }
+ }
+ // Found it; no further checks are needed.
+ if (SrcRootMOReg != AMDGPU::NoRegister)
+ break;
+ }
+
+ if (SrcRootMOReg == AMDGPU::NoRegister)
+ continue;
+
+ // We also need to ensure that Def0RootMI and Def1RootMI access the lower
+ // and upper half word of SrcRootMOReg respectively.
+ if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg,
+ TII))
+ continue;
+
+ // The graph below represents the connection :
+ // Op0Intial --> Op0x --> ... --> Op0Final
+ // / \'
+ // SrcRootMO v_Pack_b32_f16
+ // \ /
+ // Op1Intial --> Op1x --> ... --> Op1Final
+ // The nomenclature is based upon above flow-graph
+ //
+ // Also for each of DefSrcXQueue :
+ // OpXIntial is at back & OpXFinal is at front
+ auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
+ auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
+ auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
+ auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
+
+ MachineOperand *FinalOutMO = nullptr;
+ std::queue<MachineOperand *> ChainedDefOps;
+ AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+ int NumOfElemInSecondOpChain = 0;
+
+ // Now re-chain the flow according to the dominance of the MIs as follows,
+ // if possible, and store it in ChainedDefOps so it can later be used to
+ // convert each MI into its SDWA version:
+ //
+ // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
+ // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
+ // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
+ //
+ // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
+ // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
+ // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
+ //
+ // TODO : Else, not handled!
+ // One such case is observed when multiple fp16 instructions are chained
+ // on a fp16 vector input. For Example :
+ //
+ // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+ // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+ // return <2 x half> %res
+ if (dominates(Op0FinalMI, Op1IntialMI)) {
+ int OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &MOTo = Op1IntialMI->getOperand(OpIdx);
+ auto MOFrom = DefSrc0Queue.front();
+ copyRegOperand(MOTo, *MOFrom, true);
+ FinalOutMO = DefSrc1Queue.front();
+
+ LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op1IntialMI
+ << '\n');
+ OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &IntialInMO = Op0IntialMI->getOperand(OpIdx);
+
+ while (!DefSrc1Queue.empty()) {
+ ChainedDefOps.push(DefSrc1Queue.front());
+ DefSrc1Queue.pop();
+ NumOfElemInSecondOpChain++;
+ }
+ while (!DefSrc0Queue.empty()) {
+ ChainedDefOps.push(DefSrc0Queue.front());
+ DefSrc0Queue.pop();
+ }
+
+ ChainedDefOps.push(&IntialInMO);
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ } else if (dominates(Op1FinalMI, Op0IntialMI)) {
+ int OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &MOTo = Op0IntialMI->getOperand(OpIdx);
+ auto MOFrom = DefSrc1Queue.front();
+ copyRegOperand(MOTo, *MOFrom, true);
+ FinalOutMO = DefSrc0Queue.front();
+
+ LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op0IntialMI
+ << '\n');
+ OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &IntialInMO = Op1IntialMI->getOperand(OpIdx);
+
+ while (!DefSrc0Queue.empty()) {
+ ChainedDefOps.push(DefSrc0Queue.front());
+ DefSrc0Queue.pop();
+ NumOfElemInSecondOpChain++;
+ }
+ while (!DefSrc1Queue.empty()) {
+ ChainedDefOps.push(DefSrc1Queue.front());
+ DefSrc1Queue.pop();
+ }
+
+ ChainedDefOps.push(&IntialInMO);
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+ } else {
+ LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+ continue;
+ }
+
+ // Replace every use of the v_pack MI's def register with FinalOutMO.
+ MachineOperand &DefMO = MI.getOperand(0);
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+ if (!MO.isReg())
+ continue;
+
+ MO.setReg(FinalOutMO->getReg());
+ MO.setSubReg(FinalOutMO->getSubReg());
+ }
+ LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI
+ << "With " << *FinalOutMO << '\n');
+
+ // Delete the v_pack machine instruction.
+ LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+ MI.eraseFromParent();
+ ++Num16BitPackedInstructionsEliminated;
+
+ // Convert each chained machine instruction into its SDWA version.
+ while (ChainedDefOps.size() != 1) {
+ if (NumOfElemInSecondOpChain == 0) {
+ if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ else
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+ }
+
+ MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+ ChainedDefOps.pop();
+ MachineOperand *SrcMO = ChainedDefOps.front();
+
+ // If SrcMO is a def, switch to its single use, expected to be in DefMI.
+ if (SrcMO->isDef()) {
+ assert(MRI->hasOneUse(SrcMO->getReg()));
+ SrcMO = findSingleRegUse(SrcMO, MRI);
+ assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
+ }
+
+ convertMIToSDWAWithOpsel(*DefMI, *SrcMO, OpSel);
+ NumOfElemInSecondOpChain--;
+ }
+ }
+ }
+}
+
bool SIPeepholeSDWA::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -1434,6 +1940,12 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
while (!ConvertedInstructions.empty())
legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
} while (Changed);
+
+ // Process each v_pack_b32_fp16 instruction in MBB.
+ eliminateFP16Packing(MBB, ST);
+ Ret |= !ConvertedInstructions.empty();
+ while (!ConvertedInstructions.empty())
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
}
return Ret;
>From 20d75a7a0fa3b4f2bc7688cf06e2c0b654bb6826 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 06:56:22 +0000
Subject: [PATCH 02/11] Update the LIT tests to accommodate the patch effects.
---
.../AMDGPU/GlobalISel/combine-fma-sub-mul.ll | 88 +++---
.../GlobalISel/combine-fma-sub-neg-mul.ll | 40 ++-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 100 ++++---
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 20 +-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 51 ++--
llvm/test/CodeGen/AMDGPU/fract-match.ll | 11 +-
llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 18 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 14 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 18 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 35 +--
llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 23 +-
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 44 ++-
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 260 ++++++++++--------
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 260 ++++++++++--------
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 110 +++-----
llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 5 +-
llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 18 +-
llvm/test/CodeGen/AMDGPU/repeated-divisor.ll | 12 +-
llvm/test/CodeGen/AMDGPU/roundeven.ll | 80 +++---
19 files changed, 594 insertions(+), 613 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index 99bdcdd1f31e5..6bf833f067b32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -545,12 +545,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -565,12 +563,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul:
@@ -578,12 +574,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -598,12 +592,10 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -644,12 +636,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -664,12 +656,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v1, v5
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul_rhs:
@@ -677,12 +669,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -697,12 +689,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v5, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index 70f961e2777af..ad11c9b5f28ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -221,12 +221,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -241,12 +239,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -254,12 +250,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -274,12 +268,10 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%a = fmul <4 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 62b264a537457..aac8bab8ddd01 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1091,21 +1091,21 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_afn:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_afn:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_afn:
@@ -2685,17 +2685,15 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX9-LABEL: v_rcp_v2f16_arcp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_arcp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_arcp:
@@ -2734,17 +2732,15 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
@@ -3070,21 +3066,21 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
@@ -3188,21 +3184,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3250,21 +3246,21 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 9e527cf38e7ee..232e244075ea8 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -2186,17 +2186,15 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-LABEL: v_rsq_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rsq_f16_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_rsq_f16_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_rsq_v2f16:
@@ -2396,17 +2394,15 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-LABEL: v_neg_rsq_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rsq_f16_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_neg_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_rsq_f16_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index bd9807477c1c0..553803244c9fa 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -187,22 +187,18 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; GFX10-LABEL: fmul_pow2_8xhalf:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v3
-; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v2
-; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v1
-; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v0
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v7, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
-; GFX10-NEXT: v_pack_b32_f16 v2, v5, v2
-; GFX10-NEXT: v_pack_b32_f16 v3, v4, v3
+; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
@@ -302,18 +298,14 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, 0x7000
-; GFX10-NEXT: v_ldexp_f16_e32 v5, 0x7000, v3
-; GFX10-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2
-; GFX10-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1
-; GFX10-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0
-; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v8, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v7, v1
-; GFX10-NEXT: v_pack_b32_f16 v2, v6, v2
-; GFX10-NEXT: v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: fmul_pow2_ldexp_8xhalf:
@@ -1084,9 +1076,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
-; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v0
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index f00657da440f0..1dd59e82a0007 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1700,9 +1700,8 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
; GFX8-LABEL: basic_fract_v2f16_nonan:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_fract_f16_e32 v1, v0
-; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan:
@@ -2726,15 +2725,15 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s6, 0x204
-; GFX8-NEXT: v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_floor_f16_e32 v4, v0
; GFX8-NEXT: v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cmp_class_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_pack_b32_f16 v3, v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
; GFX8-NEXT: v_fract_f16_e32 v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5]
; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s6
+; GFX8-NEXT: v_floor_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX8-NEXT: v_floor_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX8-NEXT: v_pack_b32_f16 v0, v0, v5
; GFX8-NEXT: global_store_dword v[1:2], v3, off
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index e5fcc2609333b..c618868f99cc5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -188,11 +188,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cos_f16_e32 v2, v3
-; GFX9-NEXT: v_cos_f16_e32 v1, v1
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -204,11 +203,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_e32 v2, v3
-; GFX10-NEXT: v_cos_f16_e32 v1, v1
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 9ea8771506aa2..904c1e9ec908e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6478,9 +6478,8 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
; GFX900-SDAG-NEXT: v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v0
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v2f16_fast:
@@ -6658,13 +6657,12 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 268e1e25f766f..4e5458dc50ff0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6558,11 +6558,10 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v0
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_v2f16_fast:
@@ -6740,13 +6739,12 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index c3f5146168033..cebb81b3c049b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -3189,9 +3189,8 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v2f16:
@@ -3267,9 +3266,8 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fabs_v2f16:
@@ -3351,9 +3349,8 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -3436,9 +3433,8 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fneg_v2f16:
@@ -3506,9 +3502,8 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v2f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v2f16_fast:
@@ -3588,10 +3583,9 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_exp_v3f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v3f16:
@@ -3672,10 +3666,9 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v3f16_afn:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index eed67d9e020d7..d61ca3dd8a0b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -411,9 +411,9 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v2, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v3, v2, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v0, v0
; GFX9-SDAG-NEXT: v_bfe_i32 v2, v1, 0, 16
@@ -522,13 +522,14 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v3, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX9-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16
-; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v3, v4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
@@ -628,9 +629,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v0, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
@@ -698,9 +698,8 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v1, v0
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 22f562ab8557b..bfbe6996e5482 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -461,10 +461,9 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7fff
; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v3
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v3
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -585,9 +584,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -697,12 +695,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7fff
; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v5
-; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v5
; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v5
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v5
; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -844,10 +841,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v4
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -973,15 +969,13 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7fff
; GFX9-SDAG-NEXT: v_med3_i32 v5, v5, s4, v6
-; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v6
; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v6
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v6
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v6
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v5
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1157,12 +1151,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v5
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 74b6c75ac4948..8d95f42f72e56 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6563,15 +6563,25 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v2f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -6698,22 +6708,22 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_v2f16:
@@ -6850,22 +6860,22 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fneg_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_v2f16:
@@ -7003,22 +7013,22 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fneg_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_v2f16:
@@ -7138,15 +7148,25 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v2f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7265,17 +7285,29 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v3f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7398,17 +7430,29 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v3f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7569,31 +7613,29 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log_v4f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_v4f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16:
@@ -7764,31 +7806,29 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log_v4f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_v4f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index c4fdac3ac5b0e..40f379706012d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6563,15 +6563,25 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v2f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -6698,22 +6708,22 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_v2f16:
@@ -6850,22 +6860,22 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7003,22 +7013,22 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fneg_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_v2f16:
@@ -7138,15 +7148,25 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v2f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7265,17 +7285,29 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v3f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7398,17 +7430,29 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v3f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7569,31 +7613,29 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_v4f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_v4f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16:
@@ -7764,31 +7806,29 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log10_v4f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_v4f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 444f37059406a..1b634380f3b05 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -4141,17 +4141,15 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16:
@@ -4259,18 +4257,16 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_v2f16:
@@ -4386,18 +4382,16 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2f16:
@@ -4514,18 +4508,16 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fneg_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_v2f16:
@@ -4627,17 +4619,15 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v2f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v2f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16_fast:
@@ -4749,19 +4739,17 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v3f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v3f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16:
@@ -4871,19 +4859,17 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v3f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v3f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16_fast:
@@ -5011,23 +4997,19 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v4f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v4f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16:
@@ -5158,23 +5140,19 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_v4f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_v4f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 3d8a8a2962921..4383b89dfc3ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -198,9 +198,8 @@ define amdgpu_kernel void @rint_v2f16(
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 48cd7c0f3286b..748255666311f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -188,11 +188,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_sin_f16_e32 v2, v3
-; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -204,11 +203,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_e32 v2, v3
-; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index 624f4480e689a..4732253128d3d 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -829,9 +829,8 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
; GFX9-LABEL: v_repeat_divisor_v2f16_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rcp_f16_e32 v2, v2
-; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3
+; GFX9-NEXT: v_rcp_f16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rcp_f16_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -932,15 +931,14 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
; GFX9-LABEL: v_repeat_divisor_v3f16_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rcp_f16_e32 v4, v4
+; GFX9-NEXT: v_rcp_f16_sdwa v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: v_rcp_f16_e32 v5, v5
; GFX9-NEXT: s_movk_i32 s4, 0x7e00
-; GFX9-NEXT: v_pack_b32_f16 v4, v4, v6
+; GFX9-NEXT: v_rcp_f16_sdwa v4, v4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; GFX9-NEXT: v_pack_b32_f16 v5, v5, s4
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5
; GFX9-NEXT: v_pk_mul_f16 v3, v3, v5
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
; GFX9-NEXT: v_pk_mul_f16 v4, v2, v4
; GFX9-NEXT: v_alignbit_b32 v2, v3, v4, 16
; GFX9-NEXT: v_pack_b32_f16 v1, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index cf3edc0b4ac96..2a8b5cdb0ad03 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -460,17 +460,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX9-LABEL: v_roundeven_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rndne_f16_e32 v1, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -523,17 +521,15 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v2f16:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v2f16:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -602,18 +598,16 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f16_fneg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT: v_rndne_f16_e32 v1, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -676,17 +670,15 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v2f16_fneg:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e64 v0, -v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v2f16_fneg:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e64 v0, -v0
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, -v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -759,23 +751,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX9-LABEL: v_roundeven_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rndne_f16_e32 v2, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rndne_f16_e32 v3, v1
-; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v4f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rndne_f16_e32 v2, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_rndne_f16_e32 v3, v1
-; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v4f16:
@@ -850,23 +838,19 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v4f16:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v1, v1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v4f16:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v1, v1
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v4f16:
>From 20f2534a5d3b2510eb9d5f512b3cdf523837ba0f Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 10:13:40 +0000
Subject: [PATCH 03/11] [AMDGPU][NFC] Added Pre-commit tests for #137137
This adds an llc LIT test for vector fp16 operations such as log, exp, etc.
It acts as the pre-commit test for GitHub PR #137137.
---
llvm/test/CodeGen/AMDGPU/vector-fp16.ll | 2758 +++++++++++++++++++++++
1 file changed, 2758 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/vector-fp16.ll
diff --git a/llvm/test/CodeGen/AMDGPU/vector-fp16.ll b/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
new file mode 100644
index 0000000000000..501630e790200
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
@@ -0,0 +1,2758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare <1 x half> @llvm.sin.v1f16(<1 x half>)
+declare <1 x half> @llvm.cos.v1f16(<1 x half>)
+declare <1 x half> @llvm.log.v1f16(<1 x half>)
+declare <1 x half> @llvm.log2.v1f16(<1 x half>)
+declare <1 x half> @llvm.log10.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp2.v1f16(<1 x half>)
+declare <1 x half> @llvm.exp10.v1f16(<1 x half>)
+declare <1 x half> @llvm.sqrt.v1f16(<1 x half>)
+
+declare <2 x half> @llvm.sin.v2f16(<2 x half>)
+declare <2 x half> @llvm.cos.v2f16(<2 x half>)
+declare <2 x half> @llvm.log.v2f16(<2 x half>)
+declare <2 x half> @llvm.log2.v2f16(<2 x half>)
+declare <2 x half> @llvm.log10.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp2.v2f16(<2 x half>)
+declare <2 x half> @llvm.exp10.v2f16(<2 x half>)
+declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
+
+declare <4 x half> @llvm.sin.v4f16(<4 x half>)
+declare <4 x half> @llvm.cos.v4f16(<4 x half>)
+declare <4 x half> @llvm.log.v4f16(<4 x half>)
+declare <4 x half> @llvm.log2.v4f16(<4 x half>)
+declare <4 x half> @llvm.log10.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp2.v4f16(<4 x half>)
+declare <4 x half> @llvm.exp10.v4f16(<4 x half>)
+declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
+
+declare <5 x half> @llvm.sin.v5f16(<5 x half>)
+declare <5 x half> @llvm.cos.v5f16(<5 x half>)
+declare <5 x half> @llvm.log.v5f16(<5 x half>)
+declare <5 x half> @llvm.log2.v5f16(<5 x half>)
+declare <5 x half> @llvm.log10.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp2.v5f16(<5 x half>)
+declare <5 x half> @llvm.exp10.v5f16(<5 x half>)
+declare <5 x half> @llvm.sqrt.v5f16(<5 x half>)
+
+
+define <1 x half> @sin_v1f16(<1 x half> %a) {
+; GFX8-LABEL: sin_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sin_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT: v_sin_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT: v_sin_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.sin.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @cos_v1f16(<1 x half> %a) {
+; GFX8-LABEL: cos_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_cos_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cos_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT: v_cos_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT: v_cos_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cos_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.cos.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @log_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v1f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_e32 v0, v0
+; GFX906-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v1f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_e32 v0, v0
+; GFX908-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v1f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.log.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @log2_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log2_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.log2.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @log10_v1f16(<1 x half> %a) {
+; GFX8-LABEL: log10_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v1f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_e32 v0, v0
+; GFX906-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v1f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_e32 v0, v0
+; GFX908-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v1f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.log10.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @exp_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp_v1f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX906-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX906-NEXT: v_exp_f32_e32 v0, v0
+; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp_v1f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX908-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX908-NEXT: v_exp_f32_e32 v0, v0
+; GFX908-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp_v1f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX942-NEXT: v_exp_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.exp.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @exp2_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp2_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_exp_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.exp2.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @exp10_v1f16(<1 x half> %a) {
+; GFX8-LABEL: exp10_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp10_v1f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX906-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX906-NEXT: v_exp_f32_e32 v0, v0
+; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp10_v1f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX908-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX908-NEXT: v_exp_f32_e32 v0, v0
+; GFX908-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp10_v1f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX942-NEXT: v_exp_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.exp10.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <1 x half> @sqrt_v1f16(<1 x half> %a) {
+; GFX8-LABEL: sqrt_v1f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v1f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v1f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v1f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <1 x half> @llvm.sqrt.v1f16(<1 x half> %a)
+ ret <1 x half> %res
+}
+
+define <2 x half> @sin_v2f16(<2 x half> %a) {
+; GFX8-LABEL: sin_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.sin.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <2 x half> @cos_v2f16(<2 x half> %a) {
+; GFX8-LABEL: cos_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cos_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_cos_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cos_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.cos.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <2 x half> @log_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x398c
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x398c
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x398c
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x398c
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.log.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <2 x half> @log2_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log2_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log2_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log2_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log2_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.log2.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <2 x half> @log10_v2f16(<2 x half> %a) {
+; GFX8-LABEL: log10_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x34d1
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x34d1
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x34d1
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x34d1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.log10.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <2 x half> @exp_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.exp.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <2 x half> @exp2_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp2_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: exp2_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: exp2_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: exp2_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <2 x half> @exp10_v2f16(<2 x half> %a) {
+; GFX8-LABEL: exp10_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.exp10.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <2 x half> @sqrt_v2f16(<2 x half> %a) {
+; GFX8-LABEL: sqrt_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sqrt_v2f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sqrt_v2f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sqrt_v2f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ ret <2 x half> %res
+}
+
+define <4 x half> @sin_v4f16(<4 x half> %a) {
+; GFX8-LABEL: sin_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v4, v4
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_e32 v2, v2
+; GFX8-NEXT: v_sin_f16_e32 v4, v4
+; GFX8-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v4f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v4f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v4f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sin_f16_e32 v2, v2
+; GFX11-NEXT: v_sin_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @cos_v4f16(<4 x half> %a) {
+; GFX8-LABEL: cos_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v4, v4
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_cos_f16_e32 v2, v2
+; GFX8-NEXT: v_cos_f16_e32 v4, v4
+; GFX8-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v4f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v4f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v4f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cos_f16_e32 v1, v1
+; GFX11-NEXT: v_cos_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cos_f16_e32 v2, v2
+; GFX11-NEXT: v_cos_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @log_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v2, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v3, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x398c
+; GFX8-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v3, 0x398c, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v4f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x398c
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v4f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x398c
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v4f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x398c
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x398c
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x398c, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @log2_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log2_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_log_f16_e32 v1, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @log10_v4f16(<4 x half> %a) {
+; GFX8-LABEL: log10_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v2, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v3, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x34d1
+; GFX8-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v4f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x34d1
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v4f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x34d1
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v4f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x34d1
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x34d1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.log10.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @exp_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
+; GFX8-NEXT: v_exp_f32_e32 v3, v3
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_exp_f32_e32 v3, v3
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v2
+; GFX10-NEXT: v_exp_f32_e32 v3, v3
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @exp2_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp2_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: v_exp_f16_e32 v1, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f16_e32 v2, v2
+; GFX11-NEXT: v_exp_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.exp2.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @exp10_v4f16(<4 x half> %a) {
+; GFX8-LABEL: exp10_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
+; GFX8-NEXT: v_exp_f32_e32 v3, v3
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_exp_f32_e32 v3, v3
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v2
+; GFX10-NEXT: v_exp_f32_e32 v3, v3
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.exp10.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <4 x half> @sqrt_v4f16(<4 x half> %a) {
+; GFX8-LABEL: sqrt_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX11-NEXT: v_sqrt_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+}
+
+define <5 x half> @sin_v5f16(<5 x half> %a) {
+; GFX8-LABEL: sin_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x3118
+; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v5, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT: v_fract_f16_e32 v3, v3
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v5, v5
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_e32 v3, v3
+; GFX8-NEXT: v_sin_f16_e32 v5, v5
+; GFX8-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_sin_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: sin_v5f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX906-NEXT: v_sin_f16_e32 v2, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: sin_v5f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX908-NEXT: v_sin_f16_e32 v2, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sin_v5f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX942-NEXT: v_sin_f16_e32 v2, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sin_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_e32 v2, v2
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sin_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: v_sin_f16_e32 v2, v2
+; GFX11-NEXT: v_sin_f16_e32 v3, v3
+; GFX11-NEXT: v_sin_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.sin.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @cos_v5f16(<5 x half> %a) {
+; GFX8-LABEL: cos_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x3118
+; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v5, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT: v_fract_f16_e32 v3, v3
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v5, v5
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_cos_f16_e32 v3, v3
+; GFX8-NEXT: v_cos_f16_e32 v5, v5
+; GFX8-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_cos_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: cos_v5f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX906-NEXT: v_cos_f16_e32 v2, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: cos_v5f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX908-NEXT: v_cos_f16_e32 v2, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: cos_v5f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX942-NEXT: v_cos_f16_e32 v2, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cos_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_e32 v2, v2
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cos_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT: v_cos_f16_e32 v1, v1
+; GFX11-NEXT: v_cos_f16_e32 v0, v0
+; GFX11-NEXT: v_cos_f16_e32 v2, v2
+; GFX11-NEXT: v_cos_f16_e32 v3, v3
+; GFX11-NEXT: v_cos_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.cos.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @log_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v3, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v4, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v2, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x398c
+; GFX8-NEXT: v_mul_f16_e32 v3, 0x398c, v3
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v4, 0x398c, v4
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log_v5f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x398c
+; GFX906-NEXT: v_log_f16_e32 v2, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log_v5f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x398c
+; GFX908-NEXT: v_log_f16_e32 v2, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log_v5f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x398c
+; GFX942-NEXT: v_log_f16_e32 v2, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX10-NEXT: v_log_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_log_f16_e32 v4, v4
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x398c, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x398c, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0x398c, v4
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.log.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @log2_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log2_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_log_f16_e32 v1, v1
+; GFX8-NEXT: v_log_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: log2_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_e32 v2, v2
+; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log2_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_e32 v2, v2
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log2_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_log_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @log10_v5f16(<5 x half> %a) {
+; GFX8-LABEL: log10_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v3, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v4, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v2, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x34d1
+; GFX8-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v4, 0x34d1, v4
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: log10_v5f16:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX906-NEXT: s_movk_i32 s4, 0x34d1
+; GFX906-NEXT: v_log_f16_e32 v2, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: log10_v5f16:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX908-NEXT: s_movk_i32 s4, 0x34d1
+; GFX908-NEXT: v_log_f16_e32 v2, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX908-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: log10_v5f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX942-NEXT: s_movk_i32 s0, 0x34d1
+; GFX942-NEXT: v_log_f16_e32 v2, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX942-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: log10_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX10-NEXT: v_log_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: log10_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_log_f16_e32 v4, v4
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0x34d1, v4
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.log10.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @exp_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT: v_exp_f32_e32 v3, v3
+; GFX8-NEXT: v_exp_f32_e32 v4, v4
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT: v_exp_f32_e32 v3, v3
+; GFX9-NEXT: v_exp_f32_e32 v4, v4
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT: v_exp_f32_e32 v3, v3
+; GFX10-NEXT: v_exp_f32_e32 v4, v4
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: v_exp_f32_e32 v4, v4
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.exp.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @exp2_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp2_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: v_exp_f16_e32 v1, v1
+; GFX8-NEXT: v_exp_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp2_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_exp_f16_e32 v2, v2
+; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp2_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_exp_f16_e32 v2, v2
+; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp2_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: v_exp_f16_e32 v2, v2
+; GFX11-NEXT: v_exp_f16_e32 v3, v3
+; GFX11-NEXT: v_exp_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.exp2.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @exp10_v5f16(<5 x half> %a) {
+; GFX8-LABEL: exp10_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX8-NEXT: v_exp_f32_e32 v3, v3
+; GFX8-NEXT: v_exp_f32_e32 v4, v4
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_exp_f32_e32 v1, v1
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: exp10_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX9-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX9-NEXT: v_exp_f32_e32 v3, v3
+; GFX9-NEXT: v_exp_f32_e32 v4, v4
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_exp_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: exp10_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX10-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX10-NEXT: v_exp_f32_e32 v3, v3
+; GFX10-NEXT: v_exp_f32_e32 v4, v4
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: exp10_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: v_exp_f32_e32 v4, v4
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.exp10.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <5 x half> @sqrt_v5f16(<5 x half> %a) {
+; GFX8-LABEL: sqrt_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX8-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sqrt_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: sqrt_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: sqrt_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_sqrt_f16_e32 v2, v2
+; GFX11-NEXT: v_sqrt_f16_e32 v3, v3
+; GFX11-NEXT: v_sqrt_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = call <5 x half> @llvm.sqrt.v5f16(<5 x half> %a)
+ ret <5 x half> %res
+}
+
+define <4 x half> @cascaded_v4f16(<4 x half> %a) {
+; GFX8-LABEL: cascaded_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v2, v1
+; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX8-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_fract_f16_e32 v3, v3
+; GFX8-NEXT: v_sin_f16_e32 v2, v2
+; GFX8-NEXT: v_sin_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cascaded_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_e32 v2, v1
+; GFX9-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT: v_sin_f16_e32 v2, v2
+; GFX9-NEXT: v_sin_f16_e32 v0, v0
+; GFX9-NEXT: v_sin_f16_e32 v3, v3
+; GFX9-NEXT: v_sin_f16_e32 v1, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cascaded_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v2, v1
+; GFX10-NEXT: v_log_f16_e32 v3, v0
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX10-NEXT: v_sin_f16_e32 v2, v2
+; GFX10-NEXT: v_sin_f16_e32 v3, v3
+; GFX10-NEXT: v_sin_f16_e32 v0, v0
+; GFX10-NEXT: v_sin_f16_e32 v1, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cascaded_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sin_f16_e32 v2, v2
+; GFX11-NEXT: v_sin_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+ %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
+ ret <4 x half> %res
+}
+
+define <5 x half> @cascaded_v5f16(<5 x half> %a) {
+; GFX8-LABEL: cascaded_v5f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v4, v1
+; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_log_f16_e32 v2, v2
+; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX8-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX8-NEXT: v_fract_f16_e32 v4, v4
+; GFX8-NEXT: v_fract_f16_e32 v1, v1
+; GFX8-NEXT: v_fract_f16_e32 v0, v0
+; GFX8-NEXT: v_fract_f16_e32 v3, v3
+; GFX8-NEXT: v_sin_f16_e32 v4, v4
+; GFX8-NEXT: v_sin_f16_e32 v0, v0
+; GFX8-NEXT: v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_fract_f16_e32 v2, v2
+; GFX8-NEXT: v_sin_f16_e32 v2, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: cascaded_v5f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_e32 v4, v1
+; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: v_log_f16_e32 v2, v2
+; GFX9-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX9-NEXT: v_sin_f16_e32 v4, v4
+; GFX9-NEXT: v_sin_f16_e32 v0, v0
+; GFX9-NEXT: v_sin_f16_e32 v3, v3
+; GFX9-NEXT: v_sin_f16_e32 v1, v1
+; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX9-NEXT: v_sin_f16_e32 v2, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX9-NEXT: v_pack_b32_f16 v1, v4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: cascaded_v5f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v3, v1
+; GFX10-NEXT: v_log_f16_e32 v4, v0
+; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_log_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX10-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX10-NEXT: v_sin_f16_e32 v3, v3
+; GFX10-NEXT: v_sin_f16_e32 v4, v4
+; GFX10-NEXT: v_sin_f16_e32 v0, v0
+; GFX10-NEXT: v_sin_f16_e32 v1, v1
+; GFX10-NEXT: v_sin_f16_e32 v2, v2
+; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: cascaded_v5f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_log_f16_e32 v1, v1
+; GFX11-NEXT: v_log_f16_e32 v0, v0
+; GFX11-NEXT: v_log_f16_e32 v2, v2
+; GFX11-NEXT: v_log_f16_e32 v3, v3
+; GFX11-NEXT: v_log_f16_e32 v4, v4
+; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
+; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
+; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
+; GFX11-NEXT: v_sin_f16_e32 v1, v1
+; GFX11-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NEXT: v_sin_f16_e32 v2, v2
+; GFX11-NEXT: v_sin_f16_e32 v3, v3
+; GFX11-NEXT: v_sin_f16_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %b = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
+ %res = call <5 x half> @llvm.sin.v5f16(<5 x half> %b)
+ ret <5 x half> %res
+}
>From 59438d371aa1c80dad25fbe36605d2c39432a345 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Thu, 24 Apr 2025 11:10:42 +0000
Subject: [PATCH 04/11] Refactored the code in order to encapsulate redundant
code in a lambda function.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 57 ++++++++++++-----------
1 file changed, 31 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index b5b743e029e1b..a0cafe084d142 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1463,8 +1463,21 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
MachineInstr *Def1MI,
Register SrcRootReg,
const SIInstrInfo *TII) {
- // As if could, the Def1MI would have been sdwa-ed
- if (!TII->isSDWA(Def1MI->getOpcode()))
+ // Where possible, Def1MI would already be SDWA-converted to access the
+ // upper half; Def0MI should not be SDWA, as it accesses the lower half.
+ if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
+ return false;
+
+ // Def1 should write the entire DWORD of dst, with the unused part
+ // zero-padded.
+ MachineOperand *Def1DstSel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
+ if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
+ return false;
+ MachineOperand *Def1DstUnused =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_unused);
+ if (!Def1DstUnused ||
+ Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
return false;
MachineOperand *Def1Src0 =
@@ -1476,13 +1489,7 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
MachineOperand *Def0Src1 =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
- if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
- MachineOperand *Def1Src0Sel =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
- if (!Def1Src0Sel ||
- (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
- return false;
-
+ auto chkForDef0MIAccess = [&]() -> bool {
if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
MachineOperand *Def0Src0Sel =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
@@ -1500,6 +1507,19 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
return true;
}
+
+ return false;
+ };
+
+ if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
+ MachineOperand *Def1Src0Sel =
+ TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
+ if (!Def1Src0Sel ||
+ (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ return false;
+
+ if (chkForDef0MIAccess())
+ return true;
}
if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
@@ -1509,23 +1529,8 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
(Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
return false;
- if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
- MachineOperand *Def0Src0Sel =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
- if (!Def0Src0Sel)
- return true;
- if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
- return true;
- }
-
- if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
- MachineOperand *Def0Src1Sel =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
- if (!Def0Src1Sel)
- return true;
- if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
- return true;
- }
+ if (chkForDef0MIAccess())
+ return true;
}
return false;
>From a9d99f6874f430753c938f823f1060d5f16fe20c Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 28 Apr 2025 09:33:53 +0000
Subject: [PATCH 05/11] Added reviewed changes addressing redundant code &
complex logic.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 450 ++++++++++------------
1 file changed, 214 insertions(+), 236 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index a0cafe084d142..0bb3a7c1c06f2 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -78,7 +78,7 @@ class SIPeepholeSDWA {
computeMIChainsForPackedOps(MachineInstr *ParentMI,
std::queue<MachineOperand *> &DefSrcQueue,
const GCNSubtarget &ST);
- void convertMIToSDWAWithOpsel(MachineInstr &MI, MachineOperand &SrcMO,
+ void convertMIToSDWAWithOpsel(MachineInstr *MI, MachineOperand &SrcMO,
AMDGPU::SDWA::SdwaSel OpSel);
public:
@@ -281,17 +281,13 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
#endif
-static void copyRegOperand(MachineOperand &To, const MachineOperand &From,
- bool isKill = false) {
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
assert(To.isReg() && From.isReg());
To.setReg(From.getReg());
To.setSubReg(From.getSubReg());
To.setIsUndef(From.isUndef());
if (To.isUse()) {
- if (isKill)
- To.setIsKill(true);
- else
- To.setIsKill(From.isKill());
+ To.setIsKill(From.isKill());
} else {
To.setIsDead(From.isDead());
}
@@ -1489,22 +1485,20 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
MachineOperand *Def0Src1 =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
- auto chkForDef0MIAccess = [&]() -> bool {
+ auto checkForDef0MIAccess = [&]() -> bool {
if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
MachineOperand *Def0Src0Sel =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
- if (!Def0Src0Sel)
- return true;
- if (Def0Src0Sel && Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ if (!Def0Src0Sel ||
+ Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
return true;
}
if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
MachineOperand *Def0Src1Sel =
TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
- if (!Def0Src1Sel)
- return true;
- if (Def0Src1Sel && Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
+ if (!Def0Src1Sel ||
+ Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
return true;
}
@@ -1514,22 +1508,20 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
MachineOperand *Def1Src0Sel =
TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
- if (!Def1Src0Sel ||
- (Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ if (!Def1Src0Sel || Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
return false;
- if (chkForDef0MIAccess())
+ if (checkForDef0MIAccess())
return true;
}
if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
MachineOperand *Def1Src1Sel =
TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
- if (!Def1Src1Sel ||
- (Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1))
+ if (!Def1Src1Sel || Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
return false;
- if (chkForDef0MIAccess())
+ if (checkForDef0MIAccess())
return true;
}
@@ -1554,71 +1546,69 @@ static bool dominates(MachineBasicBlock::const_iterator A,
// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
// and preserving the rest of Dst's bits.
-void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr &MI,
+void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
MachineOperand &SrcMO,
AMDGPU::SDWA::SdwaSel OpSel) {
LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
- MachineInstr *SDWAInst;
- if (TII->isSDWA(MI.getOpcode())) {
- SDWAInst = &MI;
- } else {
- SDWAInst = createSDWAVersion(MI);
- MI.eraseFromParent();
+ if (!TII->isSDWA(MI->getOpcode())) {
+ MachineInstr *SDWAInst = createSDWAVersion(*MI);
+ MI->eraseFromParent();
+ MI = SDWAInst;
}
- ConvertedInstructions.push_back(SDWAInst);
- unsigned SDWAOpcode = SDWAInst->getOpcode();
+ ConvertedInstructions.push_back(MI);
+ unsigned SDWAOpcode = MI->getOpcode();
++NumSDWAInstructionsToEliminateFP16Pack;
- MachineOperand *Dst = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::vdst);
+ MachineOperand *Dst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
- MachineOperand *DstSel =
- TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_sel);
+ MachineOperand *DstSel = TII->getNamedOperand(*MI, AMDGPU::OpName::dst_sel);
assert(DstSel &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel));
DstSel->setImm(OpSel);
MachineOperand *DstUnused =
- TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::dst_unused);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
assert(DstUnused &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
"Dst_unused should not be UNUSED_PRESERVE already");
DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
- auto PreserveDstIdx =
+ int PreserveDstIdx =
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
assert(PreserveDstIdx != -1);
- auto NewSrcImplitMO = MachineOperand::CreateReg(SrcMO.getReg(), false, true);
+ MachineOperand NewSrcImplitMO =
+ MachineOperand::CreateReg(SrcMO.getReg(), false, true);
copyRegOperand(NewSrcImplitMO, SrcMO);
- SDWAInst->addOperand(NewSrcImplitMO);
- SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+ MI->addOperand(NewSrcImplitMO);
+ MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
- MachineOperand *Src0 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0);
+ MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
MachineOperand *Src0Sel =
- TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src0_sel);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src0_sel);
assert(Src0Sel &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
Src0Sel->setImm(OpSel);
- LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+ LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
return;
}
- MachineOperand *Src1 = TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1);
+ MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
MachineOperand *Src1Sel =
- TII->getNamedOperand(*SDWAInst, AMDGPU::OpName::src1_sel);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src1_sel);
assert(Src1Sel &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
Src1Sel->setImm(OpSel);
- LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+ LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
return;
}
}
@@ -1629,15 +1619,20 @@ unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
const GCNSubtarget &ST) {
unsigned NumOfFP16Def;
+
+ // Walk up the use-def chain starting at ParentMI. Stop when either no use
+ // operand is defined by an instruction convertible to SDWA, or more than
+ // one such operand is found, since it is then ambiguous which path to
+ // follow.
do {
- MachineInstr *NextMIInChain = nullptr;
NumOfFP16Def = 0;
- for (MachineOperand &currentMO : ParentMI->uses()) {
- if (!currentMO.isReg() || currentMO.getReg().isPhysical() ||
- !MRI->hasOneUse(currentMO.getReg()))
+ MachineInstr *NextMIInChain = nullptr;
+ for (MachineOperand &CurrentMO : ParentMI->uses()) {
+ if (!CurrentMO.isReg() || CurrentMO.getReg().isPhysical() ||
+ !MRI->hasOneUse(CurrentMO.getReg()))
continue;
- MachineOperand *DefCurrMO = findSingleRegDef(&currentMO, MRI);
+ MachineOperand *DefCurrMO = findSingleRegDef(&CurrentMO, MRI);
if (!DefCurrMO)
continue;
@@ -1651,11 +1646,8 @@ unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
NumOfFP16Def++;
}
- if (NumOfFP16Def > 1)
- break;
-
ParentMI = NextMIInChain;
- } while (ParentMI);
+ } while (NumOfFP16Def == 1);
return NumOfFP16Def;
}
@@ -1666,216 +1658,202 @@ void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
return;
for (MachineInstr &MI : make_early_inc_range(MBB)) {
- if (MI.getOpcode() == AMDGPU::V_PACK_B32_F16_e64) {
- LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
- std::queue<MachineOperand *> DefSrc0Queue;
- std::queue<MachineOperand *> DefSrc1Queue;
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-
- if (!Src0->isReg() || Src0->getReg().isPhysical() ||
- !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
- Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
- continue;
+ if (MI.getOpcode() != AMDGPU::V_PACK_B32_F16_e64)
+ continue;
+ LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
- MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
- MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+ std::queue<MachineOperand *> DefSrc0Queue;
+ std::queue<MachineOperand *> DefSrc1Queue;
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (!Op0 || !Op1)
- continue;
+ if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+ !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+ Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+ continue;
- MachineInstr *ParentMIOp0 = Op0->getParent();
- MachineInstr *ParentMIOp1 = Op1->getParent();
+ MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+ MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
- if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
- !isSrcDestFP16Bits(ParentMIOp1, TII) ||
- !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
- !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
- continue;
+ if (!Op0 || !Op1)
+ continue;
- DefSrc0Queue.push(Op0);
- DefSrc1Queue.push(Op1);
+ MachineInstr *ParentMIOp0 = Op0->getParent();
+ MachineInstr *ParentMIOp1 = Op1->getParent();
- // This checks for the given MI, that it only has exact one register MO
- // use , that is defined by pure FP16 instruction (that is SDWA-able too)
- unsigned NumOfFP16Def;
+ if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+ !isSrcDestFP16Bits(ParentMIOp1, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+ continue;
- NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
- if (NumOfFP16Def > 1)
- continue;
+ DefSrc0Queue.push(Op0);
+ DefSrc1Queue.push(Op1);
- NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
- if (NumOfFP16Def > 1)
- continue;
+ // Check that the given MI has exactly one register use operand that is
+ // defined by a pure FP16 instruction (which is also SDWA-able).
+ unsigned NumOfFP16Def;
- MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
- MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
- Register SrcRootMOReg = AMDGPU::NoRegister;
+ NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
+ if (NumOfFP16Def > 1)
+ continue;
- // Now, check if the last operation for each in of the DefSrcQueue
- // has the common MO, that would be the source root MO for element-wise
- // fp16 chain operations
- for (MachineOperand &Current0MO : Def0RootMI->uses()) {
- if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
- continue;
+ NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
+ if (NumOfFP16Def > 1)
+ continue;
- for (MachineOperand &Current1MO : Def1RootMI->uses()) {
- if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
- continue;
+ MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
+ MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
+ Register SrcRootMOReg = AMDGPU::NoRegister;
- if (Current0MO.getReg() == Current1MO.getReg() &&
- Current0MO.getSubReg() == Current1MO.getSubReg()) {
- SrcRootMOReg = Current0MO.getReg();
- break;
- }
- }
- // Found it, no more check needed, so break;
- if (SrcRootMOReg != AMDGPU::NoRegister)
+ // Now, check whether the last operation in each DefSrcQueue shares a
+ // common MO; that would be the source root MO for the element-wise
+ // fp16 chain operations.
+ for (MachineOperand &Current0MO : Def0RootMI->uses()) {
+ if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
+ continue;
+
+ for (MachineOperand &Current1MO : Def1RootMI->uses()) {
+ if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
+ continue;
+
+ if (Current0MO.getReg() == Current1MO.getReg() &&
+ Current0MO.getSubReg() == Current1MO.getSubReg()) {
+ SrcRootMOReg = Current0MO.getReg();
break;
+ }
}
+ // Found it, no more check needed, so break;
+ if (SrcRootMOReg != AMDGPU::NoRegister)
+ break;
+ }
- if (SrcRootMOReg == AMDGPU::NoRegister)
- continue;
+ if (SrcRootMOReg == AMDGPU::NoRegister)
+ continue;
- // Also we need to ensure that each of the DefXRootMI should access the
- // lower and upper half word of SrcRootMOReg respectively.
- if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg,
- TII))
- continue;
+ // Also we need to ensure that each of the DefXRootMI should access the
+ // lower and upper half word of SrcRootMOReg respectively.
+ if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg, TII))
+ continue;
- // The graph below represents the connection :
- // Op0Intial --> Op0x --> ... --> Op0Final
- // / \'
- // SrcRootMO v_Pack_b32_f16
- // \ /
- // Op1Intial --> Op1x --> ... --> Op1Final
- // The nomenclature is based upon above flow-graph
- //
- // Also for each of DefSrcXQueue :
- // OpXIntial is at back & OpXFinal is at front
- auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
- auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
- auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
- auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
-
- MachineOperand *FinalOutMO = nullptr;
- std::queue<MachineOperand *> ChainedDefOps;
- AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
- int NumOfElemInSecondOpChain = 0;
-
- // Now, we will change the flow as per the dominace of MI as follows, if
- // possible and store it in ChainedDefOps, so later can be used to convert
- // into its SDWA version:
- //
- // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
- // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
- // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
- //
- // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
- // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
- // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
- //
- // TODO : Else, not handled!
- // One such case is observed when multiple fp16 instruction are chained
- // on a fp16 vector input. For Example :
- //
- // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
- // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
- // return <2 x half> %res
- if (dominates(Op0FinalMI, Op1IntialMI)) {
- int OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &MOTo = Op1IntialMI->getOperand(OpIdx);
- auto MOFrom = DefSrc0Queue.front();
- copyRegOperand(MOTo, *MOFrom, true);
- FinalOutMO = DefSrc1Queue.front();
-
- LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op1IntialMI
- << '\n');
- OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &IntialInMO = Op0IntialMI->getOperand(OpIdx);
-
- while (!DefSrc1Queue.empty()) {
- ChainedDefOps.push(DefSrc1Queue.front());
- DefSrc1Queue.pop();
- NumOfElemInSecondOpChain++;
- }
- while (!DefSrc0Queue.empty()) {
- ChainedDefOps.push(DefSrc0Queue.front());
- DefSrc0Queue.pop();
- }
+ // The graph below represents the connection :
+ // Op0Intial --> Op0x --> ... --> Op0Final
+ // / \'
+ // SrcRootMO v_Pack_b32_f16
+ // \ /
+ // Op1Intial --> Op1x --> ... --> Op1Final
+ // The nomenclature is based upon above flow-graph
+ //
+ // Also for each of DefSrcXQueue :
+ // OpXIntial is at back & OpXFinal is at front
+ auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
+ auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
+ auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
+ auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
+
+ MachineOperand *FinalOutMO = nullptr;
+ std::queue<MachineOperand *> ChainedDefOps;
+ AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
+ int NumOfElemInSecondOpChain = 0;
+
+ auto canonicalizedMIFlow =
+ [&](std::queue<MachineOperand *> DefFromQueue,
+ std::queue<MachineOperand *> DefToQueue) -> void {
+ MachineInstr *OpToIntialMI = (DefToQueue.back())->getParent();
+ int OpIdx = OpToIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &MOTo = OpToIntialMI->getOperand(OpIdx);
+ auto MOFrom = DefFromQueue.front();
+ copyRegOperand(MOTo, *MOFrom);
+
+ LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *OpToIntialMI << '\n');
+
+ FinalOutMO = DefToQueue.front();
+ MachineInstr *OpFromIntialMI = (DefFromQueue.back())->getParent();
+ OpIdx = OpFromIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ auto &IntialInMO = OpFromIntialMI->getOperand(OpIdx);
+
+ while (!DefToQueue.empty()) {
+ ChainedDefOps.push(DefToQueue.front());
+ DefToQueue.pop();
+ NumOfElemInSecondOpChain++;
+ }
+ while (!DefFromQueue.empty()) {
+ ChainedDefOps.push(DefFromQueue.front());
+ DefFromQueue.pop();
+ }
+ ChainedDefOps.push(&IntialInMO);
+ };
- ChainedDefOps.push(&IntialInMO);
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
- } else if (dominates(Op1FinalMI, Op0IntialMI)) {
- int OpIdx = Op0IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &MOTo = Op0IntialMI->getOperand(OpIdx);
- auto MOFrom = DefSrc1Queue.front();
- copyRegOperand(MOTo, *MOFrom, true);
- FinalOutMO = DefSrc0Queue.front();
-
- LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *Op0IntialMI
- << '\n');
- OpIdx = Op1IntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &IntialInMO = Op1IntialMI->getOperand(OpIdx);
-
- while (!DefSrc0Queue.empty()) {
- ChainedDefOps.push(DefSrc0Queue.front());
- DefSrc0Queue.pop();
- NumOfElemInSecondOpChain++;
- }
- while (!DefSrc1Queue.empty()) {
- ChainedDefOps.push(DefSrc1Queue.front());
- DefSrc1Queue.pop();
- }
+ // Now, we will change the flow as per the dominance of MI as follows, if
+ // possible and store it in ChainedDefOps, so later can be used to convert
+ // into its SDWA version:
+ //
+ // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
+ // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
+ // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
+ //
+ // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
+ // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
+ // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
+ //
+ // TODO : Else, not handled!
+ // One such case is observed when multiple fp16 instruction are chained
+ // on a fp16 vector input. For Example :
+ //
+ // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
+ // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
+ // return <2 x half> %res
+ if (dominates(Op0FinalMI, Op1IntialMI)) {
+ canonicalizedMIFlow(DefSrc0Queue, DefSrc1Queue);
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ } else if (dominates(Op1FinalMI, Op0IntialMI)) {
+ canonicalizedMIFlow(DefSrc1Queue, DefSrc0Queue);
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
+ } else {
+ LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+ continue;
+ }
- ChainedDefOps.push(&IntialInMO);
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
- } else {
- LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+ // Replace all use places of MI(v_pack) defMO with FinalOutMO.
+ MachineOperand &DefMO = MI.getOperand(0);
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+ if (!MO.isReg())
continue;
- }
- // Replace all use places of MI(v_pack) defMO with FinalOutMO.
- MachineOperand &DefMO = MI.getOperand(0);
- for (MachineOperand &MO :
- make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
- if (!MO.isReg())
- continue;
+ MO.setReg(FinalOutMO->getReg());
+ MO.setSubReg(FinalOutMO->getSubReg());
+ }
+ LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI << "With "
+ << *FinalOutMO << '\n');
- MO.setReg(FinalOutMO->getReg());
- MO.setSubReg(FinalOutMO->getSubReg());
+ // Delete v_pack machine instruction
+ LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
+ MI.eraseFromParent();
+ ++Num16BitPackedInstructionsEliminated;
+
+ // Convert machine instruction into SDWA-version
+ while (ChainedDefOps.size() != 1) {
+ if (NumOfElemInSecondOpChain == 0) {
+ if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ else
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
}
- LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI
- << "With " << *FinalOutMO << '\n');
-
- // Delete v_pack machine instruction
- LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
- MI.eraseFromParent();
- ++Num16BitPackedInstructionsEliminated;
-
- // Convert machine instruction into SDWA-version
- while (ChainedDefOps.size() != 1) {
- if (NumOfElemInSecondOpChain == 0) {
- if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
- else
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
- }
-
- MachineInstr *DefMI = ChainedDefOps.front()->getParent();
- ChainedDefOps.pop();
- MachineOperand *SrcMO = ChainedDefOps.front();
- // Take SrcMO (which are def) as its usage in DefMI
- if (SrcMO->isDef()) {
- assert(MRI->hasOneUse(SrcMO->getReg()));
- SrcMO = findSingleRegUse(SrcMO, MRI);
- assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
- }
+ MachineInstr *DefMI = ChainedDefOps.front()->getParent();
+ ChainedDefOps.pop();
+ MachineOperand *SrcMO = ChainedDefOps.front();
- convertMIToSDWAWithOpsel(*DefMI, *SrcMO, OpSel);
- NumOfElemInSecondOpChain--;
+ // Take SrcMO (which are def) as its usage in DefMI
+ if (SrcMO->isDef()) {
+ assert(MRI->hasOneUse(SrcMO->getReg()));
+ SrcMO = findSingleRegUse(SrcMO, MRI);
+ assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
}
+
+ convertMIToSDWAWithOpsel(DefMI, *SrcMO, OpSel);
+ NumOfElemInSecondOpChain--;
}
}
}
>From aa3bb8bdf03116a4dd902a489190618df6592ab0 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 28 Apr 2025 10:41:00 +0000
Subject: [PATCH 06/11] Added MIR test to demonstrate the specific MIR pattern
handling to eliminate packing for fp16.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 11 +-
llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir | 337 +++++++++++++++++++
2 files changed, 344 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 0bb3a7c1c06f2..1c499c9b4a90b 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1537,11 +1537,14 @@ static bool dominates(MachineBasicBlock::const_iterator A,
if (B == MBBEnd)
return true;
- MachineBasicBlock::const_iterator I = MBB->begin();
- for (; &*I != A && &*I != B; ++I)
- ;
+ if (A == MBBEnd)
+ return false;
+
+ MachineBasicBlock::const_iterator I = A;
+ while (I != B && I != MBBEnd)
+ I++;
- return &*I == A;
+ return (I == B);
}
// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
new file mode 100644
index 0000000000000..2b2dce0d26a09
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
@@ -0,0 +1,337 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass si-peephole-sdwa,dead-mi-elimination -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+--- |
+ source_filename = "/home/vikashgu/work/upstream/llvm-project/llvm/test/CodeGen/AMDGPU/vector-fp16.ll"
+ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+ target triple = "amdgcn-amd-amdhsa"
+
+ define <4 x half> @sin_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @cos_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @log_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @log2_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @exp_v4f16(<4 x half> %a) #0 {
+ %res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
+ ret <4 x half> %res
+ }
+
+ define <4 x half> @cascaded_v4f16(<4 x half> %a) #0 {
+ %b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
+ %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
+ ret <4 x half> %res
+ }
+
+ declare <4 x half> @llvm.cos.v4f16(<4 x half>) #1
+
+ declare <4 x half> @llvm.exp.v4f16(<4 x half>) #1
+
+ declare <4 x half> @llvm.log.v4f16(<4 x half>) #1
+
+ declare <4 x half> @llvm.log2.v4f16(<4 x half>) #1
+
+ declare <4 x half> @llvm.sin.v4f16(<4 x half>) #1
+
+ attributes #0 = { "target-cpu"="gfx942" }
+ attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx942" }
+...
+
+
+---
+name: sin_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: sin_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[V_SIN_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_SIN_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_SIN_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_SIN_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_SIN_F16_sdwa3]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_SIN_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %13:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %26
+ $vgpr1 = COPY %27
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: cos_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: cos_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[V_COS_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_COS_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_COS_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_COS_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_COS_F16_sdwa3]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_COS_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %13:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %26
+ $vgpr1 = COPY %27
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: log_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: log_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa2]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa3]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_MUL_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %12:sreg_32 = S_MOV_B32 14732
+ %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %17, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %26
+ $vgpr1 = COPY %27
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: log2_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: log2_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_LOG_F16_sdwa3]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa2]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
+ %14:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %16, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %18, 0, killed %13, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %21
+ $vgpr1 = COPY %22
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: exp_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: exp_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811
+ ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_2]], 0, killed [[V_CVT_F16_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_CVT_F16_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %12:sgpr_32 = S_MOV_B32 1069066811
+ %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %14:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %13, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %14, 0, 0, implicit $mode, implicit $exec
+ %17:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %19:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %17, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %19, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %21, 0, 0, implicit $mode, implicit $exec
+ %24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
+ %28:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %30:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %28, 0, 0, implicit $mode, implicit $exec
+ %31:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %30, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %32:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec
+ %33:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %32, 0, 0, implicit $mode, implicit $exec
+ %34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %33, 0, 0, implicit $mode, implicit $exec
+ %35:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %15, 0, killed %22, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %34
+ $vgpr1 = COPY %35
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+
+---
+name: cascaded_v4f16
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: cascaded_v4f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_LOG_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_LOG_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_SIN_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_SIN_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_SIN_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_2]], 0, killed [[V_SIN_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_]], 0, killed [[V_SIN_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
+ %14:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %23:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %22, 0, 0, implicit $mode, implicit $exec
+ %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %16, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %18, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
+ %28:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %13, 0, 12568, 0, 0, implicit $mode, implicit $exec
+ %29:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %28, 0, 0, implicit $mode, implicit $exec
+ %30:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %29, 0, 0, implicit $mode, implicit $exec
+ %31:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %23, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %30
+ $vgpr1 = COPY %31
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
>From f064175543e1052426a817bfa726b670efed5233 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 6 May 2025 07:49:38 +0000
Subject: [PATCH 07/11] Reduced duplicated code & added a new MIR test to the
existing test file.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 106 ++++++++-----------
llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir | 43 ++++++++
2 files changed, 89 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1c499c9b4a90b..d6ca723e31142 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1476,54 +1476,44 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
return false;
- MachineOperand *Def1Src0 =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0);
- MachineOperand *Def1Src1 =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1);
- MachineOperand *Def0Src0 =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0);
- MachineOperand *Def0Src1 =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1);
-
- auto checkForDef0MIAccess = [&]() -> bool {
- if (Def0Src0 && Def0Src0->isReg() && (Def0Src0->getReg() == SrcRootReg)) {
- MachineOperand *Def0Src0Sel =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src0_sel);
- if (!Def0Src0Sel ||
- Def0Src0Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
- return true;
- }
-
- if (Def0Src1 && Def0Src1->isReg() && (Def0Src1->getReg() == SrcRootReg)) {
- MachineOperand *Def0Src1Sel =
- TII->getNamedOperand(*Def0MI, AMDGPU::OpName::src1_sel);
- if (!Def0Src1Sel ||
- Def0Src1Sel->getImm() == AMDGPU::SDWA::SdwaSel::WORD_0)
- return true;
+ const auto checkSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+ AMDGPU::OpName SrcSelName,
+ AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
+ MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
+ if (DefSrc && DefSrc->isReg() && (DefSrc->getReg() == SrcRootReg)) {
+ MachineOperand *DefSrcSel = TII->getNamedOperand(*DefMI, SrcSelName);
+ if (SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_0) {
+ if (!DefSrcSel || DefSrcSel->getImm() == SdwaSel)
+ return true;
+ } else {
+ assert(SdwaSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+ "Not valid SDWA SrcSel operand");
+ if (DefSrcSel && DefSrcSel->getImm() == SdwaSel)
+ return true;
+ }
}
-
return false;
};
- if (Def1Src0 && Def1Src0->isReg() && (Def1Src0->getReg() == SrcRootReg)) {
- MachineOperand *Def1Src0Sel =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src0_sel);
- if (!Def1Src0Sel || Def1Src0Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
- return false;
+ const auto checkForDef0MIAccess = [&]() -> bool {
+ if (checkSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0))
+ return true;
+ if (checkSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0))
+ return true;
+ return false;
+ };
+ if (checkSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1))
if (checkForDef0MIAccess())
return true;
- }
-
- if (Def1Src1 && Def1Src1->isReg() && (Def1Src1->getReg() == SrcRootReg)) {
- MachineOperand *Def1Src1Sel =
- TII->getNamedOperand(*Def1MI, AMDGPU::OpName::src1_sel);
- if (!Def1Src1Sel || Def1Src1Sel->getImm() != AMDGPU::SDWA::SdwaSel::WORD_1)
- return false;
+ if (checkSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1))
if (checkForDef0MIAccess())
return true;
- }
return false;
}
@@ -1576,7 +1566,7 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
TII->getNamedOperand(*MI, AMDGPU::OpName::dst_unused);
assert(DstUnused &&
AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused));
- assert(!(DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) &&
+ assert(DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE &&
"Dst_unused should not be UNUSED_PRESERVE already");
DstUnused->setImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE);
@@ -1589,31 +1579,27 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
MI->addOperand(NewSrcImplitMO);
MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
- MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
- assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0));
- if (Src0->isReg() && (Src0->getReg() == SrcMO.getReg())) {
- MachineOperand *Src0Sel =
- TII->getNamedOperand(*MI, AMDGPU::OpName::src0_sel);
- assert(Src0Sel &&
- AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
- Src0Sel->setImm(OpSel);
+ auto modifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
+ AMDGPU::OpName SrcSelName) -> bool {
+ MachineOperand *Src = TII->getNamedOperand(*MI, SrcName);
+ assert(Src && AMDGPU::hasNamedOperand(SDWAOpcode, SrcName));
+ if (Src->isReg() && (Src->getReg() == SrcMO.getReg())) {
+ MachineOperand *SrcSel = TII->getNamedOperand(*MI, SrcSelName);
+ assert(SrcSel && AMDGPU::hasNamedOperand(SDWAOpcode, SrcSelName));
+ SrcSel->setImm(OpSel);
- LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
- return;
- }
+ LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+ return true;
+ }
- MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
- assert(Src1 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1));
- if (Src1->isReg() && (Src1->getReg() == SrcMO.getReg())) {
- MachineOperand *Src1Sel =
- TII->getNamedOperand(*MI, AMDGPU::OpName::src1_sel);
- assert(Src1Sel &&
- AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
- Src1Sel->setImm(OpSel);
+ return false;
+ };
- LLVM_DEBUG(dbgs() << "\nInto:" << *MI << '\n');
+ if (modifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
+ return;
+
+ if (modifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
return;
- }
}
// BackTracks the given Parent MI to look for any of its use operand that has
diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
index 2b2dce0d26a09..9318f8dd2bbea 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
@@ -37,6 +37,10 @@
ret <4 x half> %res
}
+ define void @unbalanced_operations_packed(<4 x half> %a) #0 {
+ ret void
+ }
+
declare <4 x half> @llvm.cos.v4f16(<4 x half>) #1
declare <4 x half> @llvm.exp.v4f16(<4 x half>) #1
@@ -335,3 +339,42 @@ body: |
$vgpr1 = COPY %31
SI_RETURN implicit $vgpr0, implicit $vgpr1
...
+
+---
+name: unbalanced_operations_packed
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1
+ ; GFX9-LABEL: name: unbalanced_operations_packed
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa2]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa1]]
+ ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
+ %12:sreg_32 = S_MOV_B32 14732
+ %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
+ %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %17, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %26
+ $vgpr1 = COPY %27
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
>From 47d987f080e10139c466d42c83433a41d89b04ce Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 13 May 2025 10:44:30 +0000
Subject: [PATCH 08/11] Updated MIR test cases and addressed code review comments.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 35 +-
llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir | 360 +--
llvm/test/CodeGen/AMDGPU/vector-fp16.ll | 2758 ------------------
3 files changed, 109 insertions(+), 3044 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/vector-fp16.ll
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index d6ca723e31142..0d973b6fbee99 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1476,7 +1476,7 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
return false;
- const auto checkSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
+ const auto CheckSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
AMDGPU::OpName SrcSelName,
AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
MachineOperand *DefSrc = TII->getNamedOperand(*DefMI, SrcName);
@@ -1495,27 +1495,16 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
return false;
};
- const auto checkForDef0MIAccess = [&]() -> bool {
- if (checkSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
- AMDGPU::SDWA::SdwaSel::WORD_0))
- return true;
- if (checkSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
- AMDGPU::SDWA::SdwaSel::WORD_0))
- return true;
+ if (!CheckSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1) &&
+ !CheckSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_1))
return false;
- };
-
- if (checkSrcSel(Def1MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
- AMDGPU::SDWA::SdwaSel::WORD_1))
- if (checkForDef0MIAccess())
- return true;
-
- if (checkSrcSel(Def1MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
- AMDGPU::SDWA::SdwaSel::WORD_1))
- if (checkForDef0MIAccess())
- return true;
- return false;
+ return CheckSrcSel(Def0MI, AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0) ||
+ CheckSrcSel(Def0MI, AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel,
+ AMDGPU::SDWA::SdwaSel::WORD_0);
}
/// Given A and B are in the same MBB, returns true if A comes before B.
@@ -1579,7 +1568,7 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
MI->addOperand(NewSrcImplitMO);
MI->tieOperands(PreserveDstIdx, MI->getNumOperands() - 1);
- auto modifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
+ auto ModifySrcSelIntoOpSel = [&](AMDGPU::OpName SrcName,
AMDGPU::OpName SrcSelName) -> bool {
MachineOperand *Src = TII->getNamedOperand(*MI, SrcName);
assert(Src && AMDGPU::hasNamedOperand(SDWAOpcode, SrcName));
@@ -1595,10 +1584,10 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
return false;
};
- if (modifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
+ if (ModifySrcSelIntoOpSel(AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel))
return;
- if (modifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
+ if (ModifySrcSelIntoOpSel(AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel))
return;
}
diff --git a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
index 9318f8dd2bbea..e45d7dd8f2029 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-vec-fp16.mir
@@ -1,245 +1,126 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass si-peephole-sdwa,dead-mi-elimination -o - %s | FileCheck -check-prefixes=GFX9 %s
---- |
- source_filename = "/home/vikashgu/work/upstream/llvm-project/llvm/test/CodeGen/AMDGPU/vector-fp16.ll"
- target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
- target triple = "amdgcn-amd-amdhsa"
-
- define <4 x half> @sin_v4f16(<4 x half> %a) #0 {
- %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
- ret <4 x half> %res
- }
-
- define <4 x half> @cos_v4f16(<4 x half> %a) #0 {
- %res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
- ret <4 x half> %res
- }
-
- define <4 x half> @log_v4f16(<4 x half> %a) #0 {
- %res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
- ret <4 x half> %res
- }
-
- define <4 x half> @log2_v4f16(<4 x half> %a) #0 {
- %res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
- ret <4 x half> %res
- }
-
- define <4 x half> @exp_v4f16(<4 x half> %a) #0 {
- %res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
- ret <4 x half> %res
- }
-
- define <4 x half> @cascaded_v4f16(<4 x half> %a) #0 {
- %b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
- %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
- ret <4 x half> %res
- }
-
- define void @unbalanced_operations_packed(<4 x half> %a) #0 {
- ret void
- }
-
- declare <4 x half> @llvm.cos.v4f16(<4 x half>) #1
-
- declare <4 x half> @llvm.exp.v4f16(<4 x half>) #1
-
- declare <4 x half> @llvm.log.v4f16(<4 x half>) #1
-
- declare <4 x half> @llvm.log2.v4f16(<4 x half>) #1
-
- declare <4 x half> @llvm.sin.v4f16(<4 x half>) #1
-
- attributes #0 = { "target-cpu"="gfx942" }
- attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx942" }
-...
-
-
---
-name: sin_v4f16
+name: symmetric_equal_edges_fp16
tracksRegLiveness: true
body: |
- bb.0 (%ir-block.0):
- liveins: $vgpr0, $vgpr1
-
- ; GFX9-LABEL: name: sin_v4f16
- ; GFX9: liveins: $vgpr0, $vgpr1
+ bb.0:
+ liveins: $vgpr0
+ ; GFX9-LABEL: name: symmetric_equal_edges_fp16
+ ; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
; GFX9-NEXT: [[V_SIN_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa]](tied-def 0)
; GFX9-NEXT: [[V_SIN_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
- ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
- ; GFX9-NEXT: [[V_SIN_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
- ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_SIN_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_SIN_F16_sdwa2]](tied-def 0)
- ; GFX9-NEXT: [[V_SIN_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
- ; GFX9-NEXT: $vgpr0 = COPY [[V_SIN_F16_sdwa3]]
- ; GFX9-NEXT: $vgpr1 = COPY [[V_SIN_F16_sdwa1]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
- %9:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_SIN_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
%8:vgpr_32 = COPY $vgpr0
- %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
- %13:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
- %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
- %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
- %18:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
%20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
%21:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
%22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
%24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
%25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
%26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
- %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %26
- $vgpr1 = COPY %27
- SI_RETURN implicit $vgpr0, implicit $vgpr1
+ SI_RETURN implicit $vgpr0
...
---
-name: cos_v4f16
+name: asymmetric_equal_edges_fp16
tracksRegLiveness: true
body: |
- bb.0 (%ir-block.0):
- liveins: $vgpr0, $vgpr1
-
- ; GFX9-LABEL: name: cos_v4f16
- ; GFX9: liveins: $vgpr0, $vgpr1
+ bb.0:
+ liveins: $vgpr0
+ ; GFX9-LABEL: name: asymmetric_equal_edges_fp16
+ ; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
; GFX9-NEXT: [[V_COS_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa]](tied-def 0)
; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa]], 0, [[V_MOV_B32_e32_1]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa]](tied-def 0)
- ; GFX9-NEXT: [[V_COS_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
- ; GFX9-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY1]], 0, [[V_MOV_B32_e32_2]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
- ; GFX9-NEXT: [[V_COS_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa2]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa2]](tied-def 0)
- ; GFX9-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12568, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[V_COS_F16_sdwa2]], 0, [[V_MOV_B32_e32_3]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit [[V_COS_F16_sdwa2]](tied-def 0)
- ; GFX9-NEXT: [[V_COS_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_COS_F16_sdwa 0, killed [[V_MUL_F16_sdwa3]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa3]](tied-def 0)
- ; GFX9-NEXT: $vgpr0 = COPY [[V_COS_F16_sdwa3]]
- ; GFX9-NEXT: $vgpr1 = COPY [[V_COS_F16_sdwa1]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
- %9:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[V_EXP_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_sdwa 0, killed [[V_MUL_F16_sdwa1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F16_sdwa1]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_EXP_F16_sdwa]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
%8:vgpr_32 = COPY $vgpr0
- %12:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %9, 0, 12568, 0, 0, implicit $mode, implicit $exec
- %13:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %12, 0, 0, implicit $mode, implicit $exec
- %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
- %17:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %15, 0, 12568, 0, 0, implicit $mode, implicit $exec
- %18:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %17, 0, 0, implicit $mode, implicit $exec
%20:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %8, 0, 12568, 0, 0, implicit $mode, implicit $exec
%21:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
%22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
%24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, %22, 0, 12568, 0, 0, implicit $mode, implicit $exec
- %25:vgpr_32 = nofpexcept V_COS_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
%26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
- %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %26
- $vgpr1 = COPY %27
- SI_RETURN implicit $vgpr0, implicit $vgpr1
+ SI_RETURN implicit $vgpr0
...
---
-name: log_v4f16
+name: asymmetric_unequal_edges_fp16
tracksRegLiveness: true
body: |
- bb.0 (%ir-block.0):
- liveins: $vgpr0, $vgpr1
-
- ; GFX9-LABEL: name: log_v4f16
- ; GFX9: liveins: $vgpr0, $vgpr1
+ bb.0:
+ liveins: $vgpr0
+ ; GFX9-LABEL: name: asymmetric_unequal_edges_fp16
+ ; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
- ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
- ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa1]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
- ; GFX9-NEXT: [[V_MUL_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa2]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa2]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa2]](tied-def 0)
- ; GFX9-NEXT: [[V_MUL_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
- ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa3]]
- ; GFX9-NEXT: $vgpr1 = COPY [[V_MUL_F16_sdwa1]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
- %9:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
+ ; GFX9-NEXT: [[V_EXP_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_EXP_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_EXP_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
%8:vgpr_32 = COPY $vgpr0
- %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
%12:sreg_32 = S_MOV_B32 14732
- %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
- %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
- %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
- %18:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %17, 0, %12, 0, 0, implicit $mode, implicit $exec
%20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
- %21:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, %12, 0, 0, implicit $mode, implicit $exec
%22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
- %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
+ %24:vgpr_32 = nofpexcept V_EXP_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
%25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
- %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, killed %25, 0, 0, implicit $mode, implicit $exec
- %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %25, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %26
- $vgpr1 = COPY %27
- SI_RETURN implicit $vgpr0, implicit $vgpr1
+ SI_RETURN implicit $vgpr0
...
---
-name: log2_v4f16
+name: symmetric_one_edge_fp16
tracksRegLiveness: true
body: |
- bb.0 (%ir-block.0):
- liveins: $vgpr0, $vgpr1
-
- ; GFX9-LABEL: name: log2_v4f16
- ; GFX9: liveins: $vgpr0, $vgpr1
+ bb.0:
+ liveins: $vgpr0
+ ; GFX9-LABEL: name: symmetric_one_edge_fp16
+ ; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa1]](tied-def 0)
- ; GFX9-NEXT: $vgpr0 = COPY [[V_LOG_F16_sdwa3]]
- ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa2]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
- %9:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa]](tied-def 0)
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_LOG_F16_sdwa1]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
%8:vgpr_32 = COPY $vgpr0
- %11:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
- %13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
%14:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
%16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
- %18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
%20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
%21:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %16, 0, 0, implicit $mode, implicit $exec
- %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %18, 0, killed %13, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %21
- $vgpr1 = COPY %22
- SI_RETURN implicit $vgpr0, implicit $vgpr1
+ SI_RETURN implicit $vgpr0
...
---
-name: exp_v4f16
+name: symmetric_equal_edges_fp16_fp32
tracksRegLiveness: true
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: $vgpr0, $vgpr1
-
- ; GFX9-LABEL: name: exp_v4f16
+ ; GFX9-LABEL: name: symmetric_equal_edges_fp16_fp32
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
@@ -247,31 +128,11 @@ body: |
; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[V_EXP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_CVT_F32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_EXP_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_CVT_F16_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_CVT_F32_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_sdwa1]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_EXP_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_CVT_F16_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_2]], 0, killed [[V_CVT_F16_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_CVT_F16_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_CVT_F16_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
- ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
- %9:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
%8:vgpr_32 = COPY $vgpr0
- %11:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
%12:sgpr_32 = S_MOV_B32 1069066811
- %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
- %14:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %13, 0, 0, implicit $mode, implicit $exec
- %15:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %14, 0, 0, implicit $mode, implicit $exec
- %17:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
- %19:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %17, 0, 0, implicit $mode, implicit $exec
- %20:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %19, 0, %12, 0, 0, implicit $mode, implicit $exec
- %21:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %20, 0, 0, implicit $mode, implicit $exec
- %22:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %21, 0, 0, implicit $mode, implicit $exec
%24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
%25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
%26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec
@@ -282,99 +143,72 @@ body: |
%32:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec
%33:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %32, 0, 0, implicit $mode, implicit $exec
%34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %33, 0, 0, implicit $mode, implicit $exec
- %35:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %15, 0, killed %22, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %34
- $vgpr1 = COPY %35
- SI_RETURN implicit $vgpr0, implicit $vgpr1
+ SI_RETURN implicit $vgpr0
...
---
-name: cascaded_v4f16
+name: asymmetric_unequal_edges_fp16_fp32
tracksRegLiveness: true
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: $vgpr0, $vgpr1
-
- ; GFX9-LABEL: name: cascaded_v4f16
+ ; GFX9-LABEL: name: asymmetric_unequal_edges_fp16_fp32
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_LOG_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_LOG_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1069066811
+ ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_CVT_F32_F16_e64_]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[V_EXP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, [[COPY]], 0, [[S_MOV_B32_]], 0, 0, 6, 0, 5, 6, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_EXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed [[V_MUL_F16_sdwa]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, killed [[V_EXP_F16_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ %8:vgpr_32 = COPY $vgpr0
+ %12:sgpr_32 = S_MOV_B32 1069066811
+ %24:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
+ %25:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %26:vgpr_32 = nofpexcept V_EXP_F32_e64 0, killed %25, 0, 0, implicit $mode, implicit $exec
+ %27:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
+ %28:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
+ %31:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %28, 0, %12, 0, 0, implicit $mode, implicit $exec
+ %32:vgpr_32 = nofpexcept V_EXP_F16_e64 0, killed %31, 0, 0, implicit $mode, implicit $exec
+ %34:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %32, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %34
+ SI_RETURN implicit $vgpr0
+...
+
+---
+name: interleaved_symmetric_edges_fp16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GFX9-LABEL: name: interleaved_symmetric_edges_fp16
+ ; GFX9: liveins: $vgpr0
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 6, 0, 5, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_LOG_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_]], 0, 12568, 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa1]], 0, 12568, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa]], 0, 12568, 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[V_SIN_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_e64_]], 0, 12568, 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_SIN_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_2]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_MUL_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed [[V_LOG_F16_sdwa]], 0, 12568, 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_SIN_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed [[V_MUL_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_2]], 0, killed [[V_SIN_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_PACK_B32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_]], 0, killed [[V_SIN_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_SIN_F16_e64_]], 0, killed [[V_SIN_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY [[V_PACK_B32_F16_e64_]]
- ; GFX9-NEXT: $vgpr1 = COPY [[V_PACK_B32_F16_e64_1]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
- %9:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
%8:vgpr_32 = COPY $vgpr0
%11:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
%13:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %11, 0, 0, implicit $mode, implicit $exec
- %14:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
- %16:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %14, 0, 0, implicit $mode, implicit $exec
%18:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
- %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
- %22:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %20, 0, 12568, 0, 0, implicit $mode, implicit $exec
- %23:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %22, 0, 0, implicit $mode, implicit $exec
- %24:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %16, 0, 12568, 0, 0, implicit $mode, implicit $exec
- %25:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %24, 0, 0, implicit $mode, implicit $exec
%26:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %18, 0, 12568, 0, 0, implicit $mode, implicit $exec
%27:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %26, 0, 0, implicit $mode, implicit $exec
%28:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %13, 0, 12568, 0, 0, implicit $mode, implicit $exec
%29:vgpr_32 = nofpexcept V_SIN_F16_e64 0, killed %28, 0, 0, implicit $mode, implicit $exec
%30:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %27, 0, killed %29, 0, 0, implicit $mode, implicit $exec
- %31:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %23, 0, killed %25, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %30
- $vgpr1 = COPY %31
- SI_RETURN implicit $vgpr0, implicit $vgpr1
-...
-
----
-name: unbalanced_operations_packed
-tracksRegLiveness: true
-body: |
- bb.0 (%ir-block.0):
- liveins: $vgpr0, $vgpr1
- ; GFX9-LABEL: name: unbalanced_operations_packed
- ; GFX9: liveins: $vgpr0, $vgpr1
- ; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[V_LOG_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY]](tied-def 0)
- ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14732
- ; GFX9-NEXT: [[V_MUL_F16_sdwa:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa]], 0, [[S_MOV_B32_]], 0, 0, 4, 2, 4, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_MUL_F16_sdwa]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F16_sdwa]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa2:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[COPY1]], 0, 0, 4, 2, 4, implicit $mode, implicit $exec, implicit [[COPY1]](tied-def 0)
- ; GFX9-NEXT: [[V_LOG_F16_sdwa3:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_sdwa 0, [[V_LOG_F16_sdwa2]], 0, 0, 5, 2, 5, implicit $mode, implicit $exec, implicit [[V_LOG_F16_sdwa2]](tied-def 0)
- ; GFX9-NEXT: [[V_MUL_F16_sdwa1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_sdwa 0, killed [[V_LOG_F16_sdwa3]], 0, [[S_MOV_B32_]], 0, 0, 5, 2, 5, 6, implicit $mode, implicit $exec, implicit killed [[V_LOG_F16_sdwa3]](tied-def 0)
- ; GFX9-NEXT: $vgpr0 = COPY [[V_MUL_F16_sdwa1]]
- ; GFX9-NEXT: $vgpr1 = COPY [[V_LOG_F16_sdwa1]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
- %9:vgpr_32 = COPY $vgpr1
- %8:vgpr_32 = COPY $vgpr0
- %11:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %9, 0, 0, implicit $mode, implicit $exec
- %12:sreg_32 = S_MOV_B32 14732
- %13:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
- %15:vgpr_32 = V_LSHRREV_B32_e64 16, %9, implicit $exec
- %17:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %15, 0, 0, implicit $mode, implicit $exec
- %20:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %8, 0, 0, implicit $mode, implicit $exec
- %22:vgpr_32 = V_LSHRREV_B32_e64 16, %8, implicit $exec
- %24:vgpr_32 = nofpexcept V_LOG_F16_e64 0, %22, 0, 0, implicit $mode, implicit $exec
- %25:vgpr_32 = nofpexcept V_MUL_F16_e64 0, killed %24, 0, %12, 0, 0, implicit $mode, implicit $exec
- %26:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %20, 0, killed %25, 0, 0, implicit $mode, implicit $exec
- %27:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %13, 0, killed %17, 0, 0, implicit $mode, implicit $exec
- $vgpr0 = COPY %26
- $vgpr1 = COPY %27
- SI_RETURN implicit $vgpr0, implicit $vgpr1
+ SI_RETURN implicit $vgpr0
...
diff --git a/llvm/test/CodeGen/AMDGPU/vector-fp16.ll b/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
deleted file mode 100644
index 501630e790200..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vector-fp16.ll
+++ /dev/null
@@ -1,2758 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-
-declare <1 x half> @llvm.sin.v1f16(<1 x half>)
-declare <1 x half> @llvm.cos.v1f16(<1 x half>)
-declare <1 x half> @llvm.log.v1f16(<1 x half>)
-declare <1 x half> @llvm.log2.v1f16(<1 x half>)
-declare <1 x half> @llvm.log10.v1f16(<1 x half>)
-declare <1 x half> @llvm.exp.v1f16(<1 x half>)
-declare <1 x half> @llvm.exp2.v1f16(<1 x half>)
-declare <1 x half> @llvm.exp10.v1f16(<1 x half>)
-declare <1 x half> @llvm.sqrt.v1f16(<1 x half>)
-
-declare <2 x half> @llvm.sin.v2f16(<2 x half>)
-declare <2 x half> @llvm.cos.v2f16(<2 x half>)
-declare <2 x half> @llvm.log.v2f16(<2 x half>)
-declare <2 x half> @llvm.log2.v2f16(<2 x half>)
-declare <2 x half> @llvm.log10.v2f16(<2 x half>)
-declare <2 x half> @llvm.exp.v2f16(<2 x half>)
-declare <2 x half> @llvm.exp2.v2f16(<2 x half>)
-declare <2 x half> @llvm.exp10.v2f16(<2 x half>)
-declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
-
-declare <4 x half> @llvm.sin.v4f16(<4 x half>)
-declare <4 x half> @llvm.cos.v4f16(<4 x half>)
-declare <4 x half> @llvm.log.v4f16(<4 x half>)
-declare <4 x half> @llvm.log2.v4f16(<4 x half>)
-declare <4 x half> @llvm.log10.v4f16(<4 x half>)
-declare <4 x half> @llvm.exp.v4f16(<4 x half>)
-declare <4 x half> @llvm.exp2.v4f16(<4 x half>)
-declare <4 x half> @llvm.exp10.v4f16(<4 x half>)
-declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
-
-declare <5 x half> @llvm.sin.v5f16(<5 x half>)
-declare <5 x half> @llvm.cos.v5f16(<5 x half>)
-declare <5 x half> @llvm.log.v5f16(<5 x half>)
-declare <5 x half> @llvm.log2.v5f16(<5 x half>)
-declare <5 x half> @llvm.log10.v5f16(<5 x half>)
-declare <5 x half> @llvm.exp.v5f16(<5 x half>)
-declare <5 x half> @llvm.exp2.v5f16(<5 x half>)
-declare <5 x half> @llvm.exp10.v5f16(<5 x half>)
-declare <5 x half> @llvm.sqrt.v5f16(<5 x half>)
-
-
-define <1 x half> @sin_v1f16(<1 x half> %a) {
-; GFX8-LABEL: sin_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_sin_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: sin_v1f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT: v_sin_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sin_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX10-NEXT: v_sin_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sin_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sin_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.sin.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <1 x half> @cos_v1f16(<1 x half> %a) {
-; GFX8-LABEL: cos_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_cos_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: cos_v1f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT: v_cos_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cos_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX10-NEXT: v_cos_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cos_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cos_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.cos.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <1 x half> @log_v1f16(<1 x half> %a) {
-; GFX8-LABEL: log_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log_v1f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_e32 v0, v0
-; GFX906-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log_v1f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_e32 v0, v0
-; GFX908-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log_v1f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_e32 v0, v0
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_e32 v0, v0
-; GFX10-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.log.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <1 x half> @log2_v1f16(<1 x half> %a) {
-; GFX8-LABEL: log2_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: log2_v1f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_log_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log2_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log2_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.log2.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <1 x half> @log10_v1f16(<1 x half> %a) {
-; GFX8-LABEL: log10_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log10_v1f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_e32 v0, v0
-; GFX906-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log10_v1f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_e32 v0, v0
-; GFX908-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log10_v1f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_e32 v0, v0
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log10_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_e32 v0, v0
-; GFX10-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log10_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.log10.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <1 x half> @exp_v1f16(<1 x half> %a) {
-; GFX8-LABEL: exp_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: exp_v1f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX906-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX906-NEXT: v_exp_f32_e32 v0, v0
-; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: exp_v1f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX908-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX908-NEXT: v_exp_f32_e32 v0, v0
-; GFX908-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: exp_v1f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX942-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX942-NEXT: v_exp_f32_e32 v0, v0
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.exp.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <1 x half> @exp2_v1f16(<1 x half> %a) {
-; GFX8-LABEL: exp2_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp2_v1f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_exp_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp2_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_exp_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp2_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.exp2.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <1 x half> @exp10_v1f16(<1 x half> %a) {
-; GFX8-LABEL: exp10_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: exp10_v1f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX906-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX906-NEXT: v_exp_f32_e32 v0, v0
-; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: exp10_v1f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX908-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX908-NEXT: v_exp_f32_e32 v0, v0
-; GFX908-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: exp10_v1f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX942-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX942-NEXT: v_exp_f32_e32 v0, v0
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp10_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp10_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.exp10.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <1 x half> @sqrt_v1f16(<1 x half> %a) {
-; GFX8-LABEL: sqrt_v1f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: sqrt_v1f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sqrt_v1f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sqrt_v1f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <1 x half> @llvm.sqrt.v1f16(<1 x half> %a)
- ret <1 x half> %res
-}
-
-define <2 x half> @sin_v2f16(<2 x half> %a) {
-; GFX8-LABEL: sin_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT: v_fract_f16_e32 v1, v1
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_sin_f16_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: sin_v2f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: sin_v2f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: sin_v2f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sin_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sin_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT: v_sin_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sin_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.sin.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <2 x half> @cos_v2f16(<2 x half> %a) {
-; GFX8-LABEL: cos_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT: v_fract_f16_e32 v1, v1
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cos_f16_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: cos_v2f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: cos_v2f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: cos_v2f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cos_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x3118
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cos_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT: v_cos_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cos_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.cos.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <2 x half> @log_v2f16(<2 x half> %a) {
-; GFX8-LABEL: log_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, 0x398c
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log_v2f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_movk_i32 s4, 0x398c
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log_v2f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_movk_i32 s4, 0x398c
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log_v2f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_movk_i32 s0, 0x398c
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x398c
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.log.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <2 x half> @log2_v2f16(<2 x half> %a) {
-; GFX8-LABEL: log2_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log2_v2f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log2_v2f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log2_v2f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log2_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log2_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.log2.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <2 x half> @log10_v2f16(<2 x half> %a) {
-; GFX8-LABEL: log10_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, 0x34d1
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log10_v2f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_movk_i32 s4, 0x34d1
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log10_v2f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_movk_i32 s4, 0x34d1
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log10_v2f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_movk_i32 s0, 0x34d1
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log10_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x34d1
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log10_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.log10.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <2 x half> @exp_v2f16(<2 x half> %a) {
-; GFX8-LABEL: exp_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v1
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp_v2f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v1
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.exp.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <2 x half> @exp2_v2f16(<2 x half> %a) {
-; GFX8-LABEL: exp2_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: exp2_v2f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: exp2_v2f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: exp2_v2f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp2_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp2_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <2 x half> @exp10_v2f16(<2 x half> %a) {
-; GFX8-LABEL: exp10_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v1
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp10_v2f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp10_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v1
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp10_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.exp10.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <2 x half> @sqrt_v2f16(<2 x half> %a) {
-; GFX8-LABEL: sqrt_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: sqrt_v2f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: sqrt_v2f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: sqrt_v2f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sqrt_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sqrt_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
- ret <2 x half> %res
-}
-
-define <4 x half> @sin_v4f16(<4 x half> %a) {
-; GFX8-LABEL: sin_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v0
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_fract_f16_e32 v2, v2
-; GFX8-NEXT: v_fract_f16_e32 v1, v1
-; GFX8-NEXT: v_fract_f16_e32 v4, v4
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_sin_f16_e32 v2, v2
-; GFX8-NEXT: v_sin_f16_e32 v4, v4
-; GFX8-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: sin_v4f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v2, 0x3118
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: sin_v4f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, 0x3118
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: sin_v4f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x3118
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sin_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sin_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sin_f16_e32 v1, v1
-; GFX11-NEXT: v_sin_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sin_f16_e32 v2, v2
-; GFX11-NEXT: v_sin_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <4 x half> @cos_v4f16(<4 x half> %a) {
-; GFX8-LABEL: cos_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v0
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_fract_f16_e32 v2, v2
-; GFX8-NEXT: v_fract_f16_e32 v1, v1
-; GFX8-NEXT: v_fract_f16_e32 v4, v4
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_cos_f16_e32 v2, v2
-; GFX8-NEXT: v_cos_f16_e32 v4, v4
-; GFX8-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: cos_v4f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v2, 0x3118
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: cos_v4f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, 0x3118
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: cos_v4f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x3118
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cos_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cos_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cos_f16_e32 v1, v1
-; GFX11-NEXT: v_cos_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cos_f16_e32 v2, v2
-; GFX11-NEXT: v_cos_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.cos.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <4 x half> @log_v4f16(<4 x half> %a) {
-; GFX8-LABEL: log_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v2, v1
-; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v3, v0
-; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x398c
-; GFX8-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log_v4f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_movk_i32 s4, 0x398c
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log_v4f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_movk_i32 s4, 0x398c
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log_v4f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_movk_i32 s0, 0x398c
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x398c
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v2, v2
-; GFX11-NEXT: v_log_f16_e32 v3, v3
-; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX11-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.log.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <4 x half> @log2_v4f16(<4 x half> %a) {
-; GFX8-LABEL: log2_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_log_f16_e32 v1, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: log2_v4f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log2_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log2_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v2, v2
-; GFX11-NEXT: v_log_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <4 x half> @log10_v4f16(<4 x half> %a) {
-; GFX8-LABEL: log10_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v2, v1
-; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v3, v0
-; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x34d1
-; GFX8-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log10_v4f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_movk_i32 s4, 0x34d1
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log10_v4f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_movk_i32 s4, 0x34d1
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log10_v4f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_movk_i32 s0, 0x34d1
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log10_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x34d1
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log10_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v2, v2
-; GFX11-NEXT: v_log_f16_e32 v3, v3
-; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX11-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.log10.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <4 x half> @exp_v4f16(<4 x half> %a) {
-; GFX8-LABEL: exp_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_exp_f32_e32 v2, v2
-; GFX8-NEXT: v_exp_f32_e32 v3, v3
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp_v4f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT: v_exp_f32_e32 v2, v2
-; GFX9-NEXT: v_exp_f32_e32 v3, v3
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT: v_exp_f32_e32 v2, v2
-; GFX10-NEXT: v_exp_f32_e32 v3, v3
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_exp_f32_e32 v1, v1
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f32_e32 v2, v2
-; GFX11-NEXT: v_exp_f32_e32 v3, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.exp.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <4 x half> @exp2_v4f16(<4 x half> %a) {
-; GFX8-LABEL: exp2_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_exp_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_exp_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: v_exp_f16_e32 v1, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp2_v4f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp2_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp2_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v2, v2
-; GFX11-NEXT: v_exp_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.exp2.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <4 x half> @exp10_v4f16(<4 x half> %a) {
-; GFX8-LABEL: exp10_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_exp_f32_e32 v2, v2
-; GFX8-NEXT: v_exp_f32_e32 v3, v3
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp10_v4f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT: v_exp_f32_e32 v2, v2
-; GFX9-NEXT: v_exp_f32_e32 v3, v3
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp10_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT: v_exp_f32_e32 v2, v2
-; GFX10-NEXT: v_exp_f32_e32 v3, v3
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp10_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_exp_f32_e32 v1, v1
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f32_e32 v2, v2
-; GFX11-NEXT: v_exp_f32_e32 v3, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.exp10.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <4 x half> @sqrt_v4f16(<4 x half> %a) {
-; GFX8-LABEL: sqrt_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sqrt_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_sqrt_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: sqrt_v4f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sqrt_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sqrt_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sqrt_f16_e32 v2, v2
-; GFX11-NEXT: v_sqrt_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
- ret <4 x half> %res
-}
-
-define <5 x half> @sin_v5f16(<5 x half> %a) {
-; GFX8-LABEL: sin_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x3118
-; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v5, 0.15915494, v0
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX8-NEXT: v_fract_f16_e32 v3, v3
-; GFX8-NEXT: v_fract_f16_e32 v1, v1
-; GFX8-NEXT: v_fract_f16_e32 v5, v5
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_sin_f16_e32 v3, v3
-; GFX8-NEXT: v_sin_f16_e32 v5, v5
-; GFX8-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_fract_f16_e32 v2, v2
-; GFX8-NEXT: v_sin_f16_e32 v2, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: sin_v5f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX906-NEXT: v_sin_f16_e32 v2, v2
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: sin_v5f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX908-NEXT: v_sin_f16_e32 v2, v2
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: sin_v5f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX942-NEXT: v_sin_f16_e32 v2, v2
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sin_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_e32 v2, v2
-; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sin_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
-; GFX11-NEXT: v_sin_f16_e32 v1, v1
-; GFX11-NEXT: v_sin_f16_e32 v0, v0
-; GFX11-NEXT: v_sin_f16_e32 v2, v2
-; GFX11-NEXT: v_sin_f16_e32 v3, v3
-; GFX11-NEXT: v_sin_f16_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.sin.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <5 x half> @cos_v5f16(<5 x half> %a) {
-; GFX8-LABEL: cos_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x3118
-; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v5, 0.15915494, v0
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX8-NEXT: v_fract_f16_e32 v3, v3
-; GFX8-NEXT: v_fract_f16_e32 v1, v1
-; GFX8-NEXT: v_fract_f16_e32 v5, v5
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_cos_f16_e32 v3, v3
-; GFX8-NEXT: v_cos_f16_e32 v5, v5
-; GFX8-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_fract_f16_e32 v2, v2
-; GFX8-NEXT: v_cos_f16_e32 v2, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: cos_v5f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX906-NEXT: v_cos_f16_e32 v2, v2
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: cos_v5f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX908-NEXT: v_cos_f16_e32 v2, v2
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: cos_v5f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX942-NEXT: v_cos_f16_e32 v2, v2
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cos_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
-; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_e32 v2, v2
-; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cos_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
-; GFX11-NEXT: v_cos_f16_e32 v1, v1
-; GFX11-NEXT: v_cos_f16_e32 v0, v0
-; GFX11-NEXT: v_cos_f16_e32 v2, v2
-; GFX11-NEXT: v_cos_f16_e32 v3, v3
-; GFX11-NEXT: v_cos_f16_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.cos.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <5 x half> @log_v5f16(<5 x half> %a) {
-; GFX8-LABEL: log_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v3, v1
-; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v4, v0
-; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v2, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x398c
-; GFX8-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v4, 0x398c, v4
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log_v5f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_movk_i32 s4, 0x398c
-; GFX906-NEXT: v_log_f16_e32 v2, v2
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log_v5f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_movk_i32 s4, 0x398c
-; GFX908-NEXT: v_log_f16_e32 v2, v2
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log_v5f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_movk_i32 s0, 0x398c
-; GFX942-NEXT: v_log_f16_e32 v2, v2
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x398c
-; GFX10-NEXT: v_log_f16_e32 v2, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_log_f16_e32 v2, v2
-; GFX11-NEXT: v_log_f16_e32 v3, v3
-; GFX11-NEXT: v_log_f16_e32 v4, v4
-; GFX11-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX11-NEXT: v_mul_f16_e32 v4, 0x398c, v4
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.log.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <5 x half> @log2_v5f16(<5 x half> %a) {
-; GFX8-LABEL: log2_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_log_f16_e32 v1, v1
-; GFX8-NEXT: v_log_f16_e32 v2, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: log2_v5f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_log_f16_e32 v2, v2
-; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log2_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_e32 v2, v2
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log2_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_log_f16_e32 v2, v2
-; GFX11-NEXT: v_log_f16_e32 v3, v3
-; GFX11-NEXT: v_log_f16_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <5 x half> @log10_v5f16(<5 x half> %a) {
-; GFX8-LABEL: log10_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v3, v1
-; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v4, v0
-; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v2, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x34d1
-; GFX8-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v4, 0x34d1, v4
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX906-LABEL: log10_v5f16:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX906-NEXT: s_movk_i32 s4, 0x34d1
-; GFX906-NEXT: v_log_f16_e32 v2, v2
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX906-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX906-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: log10_v5f16:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX908-NEXT: s_movk_i32 s4, 0x34d1
-; GFX908-NEXT: v_log_f16_e32 v2, v2
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX908-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX908-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX908-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: log10_v5f16:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX942-NEXT: s_movk_i32 s0, 0x34d1
-; GFX942-NEXT: v_log_f16_e32 v2, v2
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX942-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX942-NEXT: v_mul_f16_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: v_mul_f16_sdwa v0, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: log10_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x34d1
-; GFX10-NEXT: v_log_f16_e32 v2, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: log10_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_log_f16_e32 v2, v2
-; GFX11-NEXT: v_log_f16_e32 v3, v3
-; GFX11-NEXT: v_log_f16_e32 v4, v4
-; GFX11-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX11-NEXT: v_mul_f16_e32 v4, 0x34d1, v4
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.log10.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <5 x half> @exp_v5f16(<5 x half> %a) {
-; GFX8-LABEL: exp_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX8-NEXT: v_exp_f32_e32 v3, v3
-; GFX8-NEXT: v_exp_f32_e32 v4, v4
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v1
-; GFX8-NEXT: v_exp_f32_e32 v2, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp_v5f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX9-NEXT: v_exp_f32_e32 v3, v3
-; GFX9-NEXT: v_exp_f32_e32 v4, v4
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_exp_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT: v_pack_b32_f16 v0, v4, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX10-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX10-NEXT: v_exp_f32_e32 v3, v3
-; GFX10-NEXT: v_exp_f32_e32 v4, v4
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v1
-; GFX10-NEXT: v_exp_f32_e32 v2, v2
-; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX11-NEXT: v_exp_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f32_e32 v2, v2
-; GFX11-NEXT: v_exp_f32_e32 v3, v3
-; GFX11-NEXT: v_exp_f32_e32 v4, v4
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.exp.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <5 x half> @exp2_v5f16(<5 x half> %a) {
-; GFX8-LABEL: exp2_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_exp_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_exp_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: v_exp_f16_e32 v1, v1
-; GFX8-NEXT: v_exp_f16_e32 v2, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp2_v5f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_exp_f16_e32 v2, v2
-; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp2_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_exp_f16_e32 v2, v2
-; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp2_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: v_exp_f16_e32 v2, v2
-; GFX11-NEXT: v_exp_f16_e32 v3, v3
-; GFX11-NEXT: v_exp_f16_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.exp2.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <5 x half> @exp10_v5f16(<5 x half> %a) {
-; GFX8-LABEL: exp10_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX8-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX8-NEXT: v_exp_f32_e32 v3, v3
-; GFX8-NEXT: v_exp_f32_e32 v4, v4
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v1
-; GFX8-NEXT: v_exp_f32_e32 v2, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: exp10_v5f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX9-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX9-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX9-NEXT: v_exp_f32_e32 v3, v3
-; GFX9-NEXT: v_exp_f32_e32 v4, v4
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_exp_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT: v_pack_b32_f16 v0, v4, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: exp10_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX10-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GFX10-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX10-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX10-NEXT: v_exp_f32_e32 v3, v3
-; GFX10-NEXT: v_exp_f32_e32 v4, v4
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v1
-; GFX10-NEXT: v_exp_f32_e32 v2, v2
-; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: exp10_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 :: v_dual_mul_f32 v0, 0x3fb8aa3b, v0
-; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
-; GFX11-NEXT: v_exp_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f32_e32 v2, v2
-; GFX11-NEXT: v_exp_f32_e32 v3, v3
-; GFX11-NEXT: v_exp_f32_e32 v4, v4
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.exp10.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <5 x half> @sqrt_v5f16(<5 x half> %a) {
-; GFX8-LABEL: sqrt_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sqrt_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_sqrt_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX8-NEXT: v_sqrt_f16_e32 v2, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: sqrt_v5f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX9-NEXT: v_sqrt_f16_e32 v2, v2
-; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: sqrt_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-; GFX10-NEXT: v_sqrt_f16_e32 v2, v2
-; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: sqrt_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_sqrt_f16_e32 v2, v2
-; GFX11-NEXT: v_sqrt_f16_e32 v3, v3
-; GFX11-NEXT: v_sqrt_f16_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = call <5 x half> @llvm.sqrt.v5f16(<5 x half> %a)
- ret <5 x half> %res
-}
-
-define <4 x half> @cascaded_v4f16(<4 x half> %a) {
-; GFX8-LABEL: cascaded_v4f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v2, v1
-; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX8-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT: v_fract_f16_e32 v2, v2
-; GFX8-NEXT: v_fract_f16_e32 v1, v1
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_fract_f16_e32 v3, v3
-; GFX8-NEXT: v_sin_f16_e32 v2, v2
-; GFX8-NEXT: v_sin_f16_e32 v0, v0
-; GFX8-NEXT: v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: cascaded_v4f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_log_f16_e32 v2, v1
-; GFX9-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_log_f16_e32 v0, v0
-; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT: v_sin_f16_e32 v2, v2
-; GFX9-NEXT: v_sin_f16_e32 v0, v0
-; GFX9-NEXT: v_sin_f16_e32 v3, v3
-; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cascaded_v4f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_e32 v2, v1
-; GFX10-NEXT: v_log_f16_e32 v3, v0
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX10-NEXT: v_sin_f16_e32 v2, v2
-; GFX10-NEXT: v_sin_f16_e32 v3, v3
-; GFX10-NEXT: v_sin_f16_e32 v0, v0
-; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cascaded_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v2, v2
-; GFX11-NEXT: v_log_f16_e32 v3, v3
-; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT: v_sin_f16_e32 v1, v1
-; GFX11-NEXT: v_sin_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sin_f16_e32 v2, v2
-; GFX11-NEXT: v_sin_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %b = call <4 x half> @llvm.log2.v4f16(<4 x half> %a)
- %res = call <4 x half> @llvm.sin.v4f16(<4 x half> %b)
- ret <4 x half> %res
-}
-
-define <5 x half> @cascaded_v5f16(<5 x half> %a) {
-; GFX8-LABEL: cascaded_v5f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v4, v1
-; GFX8-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_log_f16_e32 v2, v2
-; GFX8-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
-; GFX8-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX8-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX8-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX8-NEXT: v_fract_f16_e32 v4, v4
-; GFX8-NEXT: v_fract_f16_e32 v1, v1
-; GFX8-NEXT: v_fract_f16_e32 v0, v0
-; GFX8-NEXT: v_fract_f16_e32 v3, v3
-; GFX8-NEXT: v_sin_f16_e32 v4, v4
-; GFX8-NEXT: v_sin_f16_e32 v0, v0
-; GFX8-NEXT: v_sin_f16_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_fract_f16_e32 v2, v2
-; GFX8-NEXT: v_sin_f16_e32 v2, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: cascaded_v5f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_log_f16_e32 v4, v1
-; GFX9-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_log_f16_e32 v0, v0
-; GFX9-NEXT: v_log_f16_e32 v2, v2
-; GFX9-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
-; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX9-NEXT: v_sin_f16_e32 v4, v4
-; GFX9-NEXT: v_sin_f16_e32 v0, v0
-; GFX9-NEXT: v_sin_f16_e32 v3, v3
-; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX9-NEXT: v_sin_f16_e32 v2, v2
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX9-NEXT: v_pack_b32_f16 v1, v4, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: cascaded_v5f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_e32 v3, v1
-; GFX10-NEXT: v_log_f16_e32 v4, v0
-; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_log_f16_e32 v2, v2
-; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX10-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
-; GFX10-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX10-NEXT: v_sin_f16_e32 v3, v3
-; GFX10-NEXT: v_sin_f16_e32 v4, v4
-; GFX10-NEXT: v_sin_f16_e32 v0, v0
-; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: v_sin_f16_e32 v2, v2
-; GFX10-NEXT: v_pack_b32_f16 v0, v4, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: cascaded_v5f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_log_f16_e32 v1, v1
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_log_f16_e32 v2, v2
-; GFX11-NEXT: v_log_f16_e32 v3, v3
-; GFX11-NEXT: v_log_f16_e32 v4, v4
-; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
-; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v3, 0.15915494, v3
-; GFX11-NEXT: v_mul_f16_e32 v4, 0.15915494, v4
-; GFX11-NEXT: v_sin_f16_e32 v1, v1
-; GFX11-NEXT: v_sin_f16_e32 v0, v0
-; GFX11-NEXT: v_sin_f16_e32 v2, v2
-; GFX11-NEXT: v_sin_f16_e32 v3, v3
-; GFX11-NEXT: v_sin_f16_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT: v_pack_b32_f16 v1, v1, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %b = call <5 x half> @llvm.log2.v5f16(<5 x half> %a)
- %res = call <5 x half> @llvm.sin.v5f16(<5 x half> %b)
- ret <5 x half> %res
-}
>From 726bd97f75b45e330c71db8c1a6eff4bb16eeebe Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Fri, 5 Dec 2025 06:17:52 +0000
Subject: [PATCH 09/11] Incorporated the algorithm into the existing
SIPeepholeSDWA infrastructure utilizing standard defined APIs.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 606 ++++++++++--------
.../amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll | 8 +-
2 files changed, 331 insertions(+), 283 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 0d973b6fbee99..4d58456695031 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -23,11 +23,12 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>
-#include <queue>
using namespace llvm;
@@ -52,6 +53,8 @@ class SDWADstOperand;
using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;
+struct FP16PackCandidate;
+
class SIPeepholeSDWA {
private:
MachineRegisterInfo *MRI;
@@ -73,15 +76,8 @@ class SIPeepholeSDWA {
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
- void eliminateFP16Packing(MachineBasicBlock &MBB, const GCNSubtarget &ST);
- unsigned
- computeMIChainsForPackedOps(MachineInstr *ParentMI,
- std::queue<MachineOperand *> &DefSrcQueue,
- const GCNSubtarget &ST);
- void convertMIToSDWAWithOpsel(MachineInstr *MI, MachineOperand &SrcMO,
- AMDGPU::SDWA::SdwaSel OpSel);
-
public:
+ friend class SDWAFP16ChainOperand;
bool run(MachineFunction &MF);
};
@@ -103,6 +99,16 @@ class SIPeepholeSDWALegacy : public MachineFunctionPass {
using namespace AMDGPU::SDWA;
+struct FP16PackCandidate {
+ MachineInstr *PackMI = nullptr;
+ MachineOperand *ConnectDst = nullptr;
+ MachineOperand *ConnectSrc = nullptr;
+ MachineOperand *FinalOutMO = nullptr;
+ SmallVector<MachineOperand *, 8> ChainOps;
+ AMDGPU::SDWA::SdwaSel InitialSel = AMDGPU::SDWA::SdwaSel::DWORD;
+ int SecondChainLength = 0;
+};
+
class SDWAOperand {
private:
MachineOperand *Target; // Operand that would be used in converted instruction
@@ -221,6 +227,52 @@ class SDWADstPreserveOperand : public SDWADstOperand {
#endif
};
+class SDWAFP16ChainOperand : public SDWAOperand {
+private:
+ SIPeepholeSDWA &Parent;
+ FP16PackCandidate Candidate;
+
+ static bool processCandidate(SIPeepholeSDWA &Parent,
+ FP16PackCandidate &Candidate);
+ static unsigned
+ computeMIChainsForPackedOps(SIPeepholeSDWA &Parent, MachineInstr *ParentMI,
+ SmallVectorImpl<MachineOperand *> &DefSrcVec,
+ const GCNSubtarget &ST);
+ static void convertMIToSDWAWithOpsel(SIPeepholeSDWA &Parent, MachineInstr *MI,
+ MachineOperand &SrcMO, SdwaSel OpSel);
+
+public:
+ SDWAFP16ChainOperand(SIPeepholeSDWA &Parent, MachineOperand *PackDst,
+ FP16PackCandidate Candidate)
+ : SDWAOperand(PackDst, PackDst), Parent(Parent),
+ Candidate(std::move(Candidate)) {}
+
+ static std::optional<FP16PackCandidate>
+ buildCandidate(SIPeepholeSDWA &Parent, MachineInstr &PackMI,
+ const GCNSubtarget &ST);
+
+ MachineInstr *
+ potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override {
+ assert(Candidate.PackMI != nullptr);
+ return Candidate.PackMI;
+ }
+
+ bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override {
+ assert(&MI == Candidate.PackMI);
+ return processCandidate(Parent, Candidate);
+ }
+
+ bool canCombineSelections(const MachineInstr &MI,
+ const SIInstrInfo *TII) override {
+ return true;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS) const override;
+#endif
+};
+
} // end anonymous namespace
INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
@@ -281,6 +333,17 @@ void SDWADstPreserveOperand::print(raw_ostream& OS) const {
#endif
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void SDWAFP16ChainOperand::print(raw_ostream &OS) const {
+ OS << "SDWA fp16 chain dst: " << *getTargetOperand()
+ << " initial_sel:" << Candidate.InitialSel
+ << " chain_ops:" << Candidate.ChainOps.size()
+ << " second_chain_len:" << Candidate.SecondChainLength << '\n';
+ if (Candidate.PackMI)
+ OS << " packed MI: " << *Candidate.PackMI;
+}
+#endif
+
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
assert(To.isReg() && From.isReg());
To.setReg(From.getReg());
@@ -977,6 +1040,17 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
OrDst, OrSDWADef, OrOtherDef, DstSel);
}
+ case AMDGPU::V_PACK_B32_F16_e64: {
+ const GCNSubtarget &ST = MRI->getMF().getSubtarget<GCNSubtarget>();
+ if (auto Candidate = SDWAFP16ChainOperand::buildCandidate(*this, MI, ST)) {
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (!Dst || !Dst->isReg() || Dst->getReg().isPhysical())
+ break;
+ return std::make_unique<SDWAFP16ChainOperand>(*this, Dst,
+ std::move(*Candidate));
+ }
+ break;
+ }
}
return std::unique_ptr<SDWAOperand>(nullptr);
@@ -1384,75 +1458,38 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
}
static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+ static const DenseSet<unsigned> FP16BitOpcodes = {
+ AMDGPU::V_CVT_F16_U16_e32, AMDGPU::V_CVT_F16_U16_e64,
+ AMDGPU::V_CVT_F16_I16_e32, AMDGPU::V_CVT_F16_I16_e64,
+ AMDGPU::V_RCP_F16_e64, AMDGPU::V_RCP_F16_e32,
+ AMDGPU::V_RSQ_F16_e64, AMDGPU::V_RSQ_F16_e32,
+ AMDGPU::V_SQRT_F16_e64, AMDGPU::V_SQRT_F16_e32,
+ AMDGPU::V_LOG_F16_e64, AMDGPU::V_LOG_F16_e32,
+ AMDGPU::V_EXP_F16_e64, AMDGPU::V_EXP_F16_e32,
+ AMDGPU::V_SIN_F16_e64, AMDGPU::V_SIN_F16_e32,
+ AMDGPU::V_COS_F16_e64, AMDGPU::V_COS_F16_e32,
+ AMDGPU::V_FLOOR_F16_e64, AMDGPU::V_FLOOR_F16_e32,
+ AMDGPU::V_CEIL_F16_e64, AMDGPU::V_CEIL_F16_e32,
+ AMDGPU::V_TRUNC_F16_e64, AMDGPU::V_TRUNC_F16_e32,
+ AMDGPU::V_RNDNE_F16_e64, AMDGPU::V_RNDNE_F16_e32,
+ AMDGPU::V_FRACT_F16_e64, AMDGPU::V_FRACT_F16_e32,
+ AMDGPU::V_FREXP_MANT_F16_e64, AMDGPU::V_FREXP_MANT_F16_e32,
+ AMDGPU::V_FREXP_EXP_I16_F16_e64, AMDGPU::V_FREXP_EXP_I16_F16_e32,
+ AMDGPU::V_LDEXP_F16_e64, AMDGPU::V_LDEXP_F16_e32,
+ AMDGPU::V_ADD_F16_e64, AMDGPU::V_ADD_F16_e32,
+ AMDGPU::V_SUB_F16_e64, AMDGPU::V_SUB_F16_e32,
+ AMDGPU::V_SUBREV_F16_e64, AMDGPU::V_SUBREV_F16_e32,
+ AMDGPU::V_MUL_F16_e64, AMDGPU::V_MUL_F16_e32,
+ AMDGPU::V_MAX_F16_e64, AMDGPU::V_MAX_F16_e32,
+ AMDGPU::V_MIN_F16_e64, AMDGPU::V_MIN_F16_e32,
+ AMDGPU::V_MAD_F16_e64, AMDGPU::V_FMA_F16_e64,
+ AMDGPU::V_DIV_FIXUP_F16_e64};
+
unsigned Opcode = MI->getOpcode();
if (TII->isSDWA(Opcode))
Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
- switch (Opcode) {
- case AMDGPU::V_CVT_F16_U16_e32:
- case AMDGPU::V_CVT_F16_U16_e64:
- case AMDGPU::V_CVT_F16_I16_e32:
- case AMDGPU::V_CVT_F16_I16_e64:
- case AMDGPU::V_RCP_F16_e64:
- case AMDGPU::V_RCP_F16_e32:
- case AMDGPU::V_RSQ_F16_e64:
- case AMDGPU::V_RSQ_F16_e32:
- case AMDGPU::V_SQRT_F16_e64:
- case AMDGPU::V_SQRT_F16_e32:
- case AMDGPU::V_LOG_F16_e64:
- case AMDGPU::V_LOG_F16_e32:
- case AMDGPU::V_EXP_F16_e64:
- case AMDGPU::V_EXP_F16_e32:
- case AMDGPU::V_SIN_F16_e64:
- case AMDGPU::V_SIN_F16_e32:
- case AMDGPU::V_COS_F16_e64:
- case AMDGPU::V_COS_F16_e32:
- case AMDGPU::V_FLOOR_F16_e64:
- case AMDGPU::V_FLOOR_F16_e32:
- case AMDGPU::V_CEIL_F16_e64:
- case AMDGPU::V_CEIL_F16_e32:
- case AMDGPU::V_TRUNC_F16_e64:
- case AMDGPU::V_TRUNC_F16_e32:
- case AMDGPU::V_RNDNE_F16_e64:
- case AMDGPU::V_RNDNE_F16_e32:
- case AMDGPU::V_FRACT_F16_e64:
- case AMDGPU::V_FRACT_F16_e32:
- case AMDGPU::V_FREXP_MANT_F16_e64:
- case AMDGPU::V_FREXP_MANT_F16_e32:
- case AMDGPU::V_FREXP_EXP_I16_F16_e64:
- case AMDGPU::V_FREXP_EXP_I16_F16_e32:
- case AMDGPU::V_LDEXP_F16_e64:
- case AMDGPU::V_LDEXP_F16_e32:
- case AMDGPU::V_ADD_F16_e64:
- case AMDGPU::V_ADD_F16_e32:
- case AMDGPU::V_SUB_F16_e64:
- case AMDGPU::V_SUB_F16_e32:
- case AMDGPU::V_SUBREV_F16_e64:
- case AMDGPU::V_SUBREV_F16_e32:
- case AMDGPU::V_MUL_F16_e64:
- case AMDGPU::V_MUL_F16_e32:
- case AMDGPU::V_MAX_F16_e64:
- case AMDGPU::V_MAX_F16_e32:
- case AMDGPU::V_MIN_F16_e64:
- case AMDGPU::V_MIN_F16_e32:
- case AMDGPU::V_MAD_F16_e64:
- case AMDGPU::V_FMA_F16_e64:
- case AMDGPU::V_DIV_FIXUP_F16_e64:
- return true;
- case AMDGPU::V_MADAK_F16:
- case AMDGPU::V_MADMK_F16:
- case AMDGPU::V_FMAMK_F16:
- case AMDGPU::V_FMAAK_F16:
- // NOTE : SKEPTICAL ABOUT IT
- return false;
- case AMDGPU::V_FMAC_F16_e32:
- case AMDGPU::V_FMAC_F16_e64:
- case AMDGPU::V_MAC_F16_e32:
- case AMDGPU::V_MAC_F16_e64:
- // As their sdwa version allow dst_sel to be equal only set to DWORD
- default:
- return false;
- }
+ return FP16BitOpcodes.contains(Opcode);
}
static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
@@ -1526,20 +1563,22 @@ static bool dominates(MachineBasicBlock::const_iterator A,
return (I == B);
}
-// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set with OpSel
-// and preserving the rest of Dst's bits.
-void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
- MachineOperand &SrcMO,
- AMDGPU::SDWA::SdwaSel OpSel) {
+// Convert MI into its SDWA version with its Dst_Sel & SrcMO_Sel set to OpSel
+// and preserve the untouched destination bits by tying the implicit operand.
+void SDWAFP16ChainOperand::convertMIToSDWAWithOpsel(SIPeepholeSDWA &Parent,
+ MachineInstr *MI,
+ MachineOperand &SrcMO,
+ SdwaSel OpSel) {
+ const SIInstrInfo *TII = Parent.TII;
LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
if (!TII->isSDWA(MI->getOpcode())) {
- MachineInstr *SDWAInst = createSDWAVersion(*MI);
+ MachineInstr *SDWAInst = Parent.createSDWAVersion(*MI);
MI->eraseFromParent();
MI = SDWAInst;
}
- ConvertedInstructions.push_back(MI);
+ Parent.ConvertedInstructions.push_back(MI);
unsigned SDWAOpcode = MI->getOpcode();
++NumSDWAInstructionsToEliminateFP16Pack;
@@ -1591,12 +1630,15 @@ void SIPeepholeSDWA::convertMIToSDWAWithOpsel(MachineInstr *MI,
return;
}
-// BackTracks the given Parent MI to look for any of its use operand that has
-// been defined by FP16 (sdwa-able) in recursive fashion.
-unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
- MachineInstr *ParentMI, std::queue<MachineOperand *> &DefSrcQueue,
- const GCNSubtarget &ST) {
+// Backtrack ParentMI to locate use operands defined by SDWA-convertible FP16
+// instructions. Track the linear chain as long as exactly one qualifying def is
+// found; bail out once the path forks.
+unsigned SDWAFP16ChainOperand::computeMIChainsForPackedOps(
+ SIPeepholeSDWA &Parent, MachineInstr *ParentMI,
+ SmallVectorImpl<MachineOperand *> &DefSrcVec, const GCNSubtarget &ST) {
unsigned NumOfFP16Def;
+ MachineRegisterInfo *MRI = Parent.MRI;
+ const SIInstrInfo *TII = Parent.TII;
// We will go up the use-def chain for ParentMI, until we encounter the
// exit condition, where we don't find any such defs of use operands
@@ -1620,7 +1662,7 @@ unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
continue;
NextMIInChain = DefCurrMI;
- DefSrcQueue.push(DefCurrMO);
+ DefSrcVec.push_back(DefCurrMO);
NumOfFP16Def++;
}
@@ -1630,210 +1672,217 @@ unsigned SIPeepholeSDWA::computeMIChainsForPackedOps(
return NumOfFP16Def;
}
-void SIPeepholeSDWA::eliminateFP16Packing(MachineBasicBlock &MBB,
- const GCNSubtarget &ST) {
- if (!ST.has16BitInsts())
- return;
-
- for (MachineInstr &MI : make_early_inc_range(MBB)) {
- if (MI.getOpcode() != AMDGPU::V_PACK_B32_F16_e64)
- continue;
- LLVM_DEBUG(dbgs() << "\nCandidate FP16 Packed MI : " << MI << '\n');
-
- std::queue<MachineOperand *> DefSrc0Queue;
- std::queue<MachineOperand *> DefSrc1Queue;
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-
- if (!Src0->isReg() || Src0->getReg().isPhysical() ||
- !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
- Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
- continue;
-
- MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
- MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
-
- if (!Op0 || !Op1)
- continue;
-
- MachineInstr *ParentMIOp0 = Op0->getParent();
- MachineInstr *ParentMIOp1 = Op1->getParent();
+// Examine V_PACK_B32_F16 uses and attempt to form an FP16 chain candidate that
+// can be converted into SDWA form. This mirrors the legacy flow:
+// Op0Initial -> ... -> Op0Final -> pack
+// Op1Initial -> ... -> Op1Final -> pack
+// If dominance allows, the chains are canonicalized into a single vector
+// (ChainOps) that records the order in which SDWA conversions should apply.
+std::optional<FP16PackCandidate>
+SDWAFP16ChainOperand::buildCandidate(SIPeepholeSDWA &Parent, MachineInstr &MI,
+ const GCNSubtarget &ST) {
+ if (MI.getOpcode() != AMDGPU::V_PACK_B32_F16_e64 || !ST.has16BitInsts())
+ return std::nullopt;
+
+ const SIInstrInfo *TII = Parent.TII;
+ MachineRegisterInfo *MRI = Parent.MRI;
+ const SIRegisterInfo *TRI = Parent.TRI;
- if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
- !isSrcDestFP16Bits(ParentMIOp1, TII) ||
- !isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
- !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
- continue;
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src0 || !Src1)
+ return std::nullopt;
+
+ if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+ !MRI->hasOneUse(Src0->getReg()) || !Src1->isReg() ||
+ Src1->getReg().isPhysical() || !MRI->hasOneUse(Src1->getReg()))
+ return std::nullopt;
+
+ MachineOperand *Op0 = findSingleRegDef(Src0, MRI);
+ MachineOperand *Op1 = findSingleRegDef(Src1, MRI);
+ if (!Op0 || !Op1)
+ return std::nullopt;
+
+ MachineInstr *ParentMIOp0 = Op0->getParent();
+ MachineInstr *ParentMIOp1 = Op1->getParent();
+
+ if (!isSrcDestFP16Bits(ParentMIOp0, TII) ||
+ !isSrcDestFP16Bits(ParentMIOp1, TII))
+ return std::nullopt;
+
+ if (!isConvertibleToSDWA(*ParentMIOp0, ST, TII) ||
+ !isConvertibleToSDWA(*ParentMIOp1, ST, TII))
+ return std::nullopt;
+
+ SmallVector<MachineOperand *, 8> DefSrc0Vec;
+ SmallVector<MachineOperand *, 8> DefSrc1Vec;
+ DefSrc0Vec.push_back(Op0);
+ DefSrc1Vec.push_back(Op1);
+
+ // This checks that the given MI has exactly one register use operand that
+ // is defined by a pure FP16 instruction (which is SDWA-able too).
+ unsigned NumOfFP16Def =
+ computeMIChainsForPackedOps(Parent, ParentMIOp0, DefSrc0Vec, ST);
+ if (NumOfFP16Def > 1)
+ return std::nullopt;
+
+ NumOfFP16Def =
+ computeMIChainsForPackedOps(Parent, ParentMIOp1, DefSrc1Vec, ST);
+ if (NumOfFP16Def > 1)
+ return std::nullopt;
+
+ MachineInstr *Def0RootMI = (DefSrc0Vec.back())->getParent();
+ MachineInstr *Def1RootMI = (DefSrc1Vec.back())->getParent();
+ Register SrcRootMOReg = AMDGPU::NoRegister;
+
+ // Now, check whether the last operations of DefSrc0Vec and DefSrc1Vec
+ // share a common MO, which would be the source root MO for the
+ // element-wise fp16 chain operations.
+
+ SmallDenseSet<Register, 8> Def0Regs;
+ for (MachineOperand &MO : Def0RootMI->uses()) {
+ if (MO.isReg() && !MO.getReg().isPhysical())
+ Def0Regs.insert(MO.getReg());
+ }
- DefSrc0Queue.push(Op0);
- DefSrc1Queue.push(Op1);
+ for (MachineOperand &MO : Def1RootMI->uses()) {
+ if (MO.isReg() && !MO.getReg().isPhysical() &&
+ Def0Regs.contains(MO.getReg())) {
+ SrcRootMOReg = MO.getReg();
+ break;
+ }
+ }
- // This checks for the given MI, that it only has exact one register MO
- // use , that is defined by pure FP16 instruction (that is SDWA-able too)
- unsigned NumOfFP16Def;
+ if (SrcRootMOReg == AMDGPU::NoRegister)
+ return std::nullopt;
+
+ // Also we need to ensure that each of the DefXRootMI should access the
+ // lower and upper half word of SrcRootMOReg respectively.
+ if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg, TII))
+ return std::nullopt;
+
+ MachineInstr *Op0FinalMI = (DefSrc0Vec.front())->getParent();
+ MachineInstr *Op1FinalMI = (DefSrc1Vec.front())->getParent();
+ MachineInstr *Op0IntialMI = (DefSrc0Vec.back())->getParent();
+ MachineInstr *Op1IntialMI = (DefSrc1Vec.back())->getParent();
+
+ FP16PackCandidate Candidate;
+ Candidate.PackMI = &MI;
+
+ auto buildCandidateForDefChain =
+ [&](const SmallVectorImpl<MachineOperand *> &DefFromVec,
+ const SmallVectorImpl<MachineOperand *> &DefToVec,
+ AMDGPU::SDWA::SdwaSel InitialSel)
+ -> std::optional<FP16PackCandidate> {
+ Candidate.ConnectSrc = DefFromVec.front();
+ Candidate.FinalOutMO = DefToVec.front();
+ Candidate.InitialSel = InitialSel;
+ Candidate.SecondChainLength = 0;
+
+ MachineInstr *OpToIntialMI = (DefToVec.back())->getParent();
+ int OpIdx = OpToIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ if (OpIdx == -1)
+ return std::nullopt;
+ Candidate.ConnectDst = &OpToIntialMI->getOperand(OpIdx);
+
+ MachineInstr *OpFromIntialMI = (DefFromVec.back())->getParent();
+ OpIdx = OpFromIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
+ if (OpIdx == -1)
+ return std::nullopt;
+ MachineOperand *IntialInMO = &OpFromIntialMI->getOperand(OpIdx);
+
+ Candidate.ChainOps.clear();
+ Candidate.ChainOps.reserve(DefFromVec.size() + DefToVec.size() + 1);
+
+ auto appendVec = [&](const SmallVectorImpl<MachineOperand *> &Vec,
+ bool CountSecond) {
+ for (MachineOperand *MO : Vec) {
+ Candidate.ChainOps.push_back(MO);
+ if (CountSecond)
+ ++Candidate.SecondChainLength;
+ }
+ };
- NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp0, DefSrc0Queue, ST);
- if (NumOfFP16Def > 1)
- continue;
+ appendVec(DefToVec, true);
+ appendVec(DefFromVec, false);
+ Candidate.ChainOps.push_back(IntialInMO);
+ return Candidate;
+ };
- NumOfFP16Def = computeMIChainsForPackedOps(ParentMIOp1, DefSrc1Queue, ST);
- if (NumOfFP16Def > 1)
- continue;
+ if (dominates(Op0FinalMI, Op1IntialMI))
+ return buildCandidateForDefChain(DefSrc0Vec, DefSrc1Vec,
+ AMDGPU::SDWA::SdwaSel::WORD_1);
- MachineInstr *Def0RootMI = (DefSrc0Queue.back())->getParent();
- MachineInstr *Def1RootMI = (DefSrc1Queue.back())->getParent();
- Register SrcRootMOReg = AMDGPU::NoRegister;
+ if (dominates(Op1FinalMI, Op0IntialMI))
+ return buildCandidateForDefChain(DefSrc1Vec, DefSrc0Vec,
+ AMDGPU::SDWA::SdwaSel::WORD_0);
- // Now, check if the last operation for each in of the DefSrcQueue
- // has the common MO, that would be the source root MO for element-wise
- // fp16 chain operations
- for (MachineOperand &Current0MO : Def0RootMI->uses()) {
- if (!Current0MO.isReg() || Current0MO.getReg().isPhysical())
- continue;
+ LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
+ return std::nullopt;
+}
- for (MachineOperand &Current1MO : Def1RootMI->uses()) {
- if (!Current1MO.isReg() || Current1MO.getReg().isPhysical())
- continue;
+bool SDWAFP16ChainOperand::processCandidate(SIPeepholeSDWA &Parent,
+ FP16PackCandidate &Candidate) {
+ // The graph below represents the connection :
+ // Op0Initial --> Op0x --> ... --> Op0Final
+ // / \'
+ // SrcRootMO v_pack_b32_f16
+ // \ /
+ // Op1Initial --> Op1x --> ... --> Op1Final
+ // The nomenclature follows the same convention as the legacy algorithm.
+ if (!Candidate.ConnectDst || !Candidate.ConnectSrc || !Candidate.FinalOutMO ||
+ Candidate.ChainOps.size() < 2)
+ return false;
- if (Current0MO.getReg() == Current1MO.getReg() &&
- Current0MO.getSubReg() == Current1MO.getSubReg()) {
- SrcRootMOReg = Current0MO.getReg();
- break;
- }
- }
- // Found it, no more check needed, so break;
- if (SrcRootMOReg != AMDGPU::NoRegister)
- break;
- }
+ MachineRegisterInfo *MRI = Parent.MRI;
+ const SIInstrInfo *TII = Parent.TII;
- if (SrcRootMOReg == AMDGPU::NoRegister)
- continue;
+ copyRegOperand(*Candidate.ConnectDst, *Candidate.ConnectSrc);
+ LLVM_DEBUG(dbgs() << "Updated Connecting MI : "
+ << *Candidate.ConnectDst->getParent() << '\n');
- // Also we need to ensure that each of the DefXRootMI should access the
- // lower and upper half word of SrcRootMOReg respectively.
- if (!checkForRightSrcRootAccess(Def0RootMI, Def1RootMI, SrcRootMOReg, TII))
+ MachineOperand &DefMO =
+ *TII->getNamedOperand(*Candidate.PackMI, AMDGPU::OpName::vdst);
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
+ if (!MO.isReg())
continue;
- // The graph below represents the connection :
- // Op0Intial --> Op0x --> ... --> Op0Final
- // / \'
- // SrcRootMO v_Pack_b32_f16
- // \ /
- // Op1Intial --> Op1x --> ... --> Op1Final
- // The nomenclature is based upon above flow-graph
- //
- // Also for each of DefSrcXQueue :
- // OpXIntial is at back & OpXFinal is at front
- auto Op0FinalMI = (DefSrc0Queue.front())->getParent();
- auto Op1FinalMI = (DefSrc1Queue.front())->getParent();
- auto Op0IntialMI = (DefSrc0Queue.back())->getParent();
- auto Op1IntialMI = (DefSrc1Queue.back())->getParent();
-
- MachineOperand *FinalOutMO = nullptr;
- std::queue<MachineOperand *> ChainedDefOps;
- AMDGPU::SDWA::SdwaSel OpSel = AMDGPU::SDWA::SdwaSel::DWORD;
- int NumOfElemInSecondOpChain = 0;
-
- auto canonicalizedMIFlow =
- [&](std::queue<MachineOperand *> DefFromQueue,
- std::queue<MachineOperand *> DefToQueue) -> void {
- MachineInstr *OpToIntialMI = (DefToQueue.back())->getParent();
- int OpIdx = OpToIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &MOTo = OpToIntialMI->getOperand(OpIdx);
- auto MOFrom = DefFromQueue.front();
- copyRegOperand(MOTo, *MOFrom);
-
- LLVM_DEBUG(dbgs() << "Updated Connecting MI : " << *OpToIntialMI << '\n');
-
- FinalOutMO = DefToQueue.front();
- MachineInstr *OpFromIntialMI = (DefFromQueue.back())->getParent();
- OpIdx = OpFromIntialMI->findRegisterUseOperandIdx(SrcRootMOReg, TRI);
- auto &IntialInMO = OpFromIntialMI->getOperand(OpIdx);
-
- while (!DefToQueue.empty()) {
- ChainedDefOps.push(DefToQueue.front());
- DefToQueue.pop();
- NumOfElemInSecondOpChain++;
- }
- while (!DefFromQueue.empty()) {
- ChainedDefOps.push(DefFromQueue.front());
- DefFromQueue.pop();
- }
- ChainedDefOps.push(&IntialInMO);
- };
-
- // Now, we will change the flow as per the dominace of MI as follows, if
- // possible and store it in ChainedDefOps, so later can be used to convert
- // into its SDWA version:
- //
- // If (dominates(Op0FinalMI, Op1IntialMI)) == TRUE
- // SrcRootMO -> Op0Intial -> Op0x -> ... -> Op0Final
- // -> Op1Intial -> Op1x -> ... -> Op1Final (FinalOutMO)
- //
- // If (dominates(Op1FinalMI, Op0IntialMI)) == TRUE
- // SrcRootMO -> Op1Intial -> Op1x -> ... -> Op1Final
- // -> Op0Intial -> Op0x -> ... -> Op0Final (FinalOutMO)
- //
- // TODO : Else, not handled!
- // One such case is observed when multiple fp16 instruction are chained
- // on a fp16 vector input. For Example :
- //
- // %1 = call <2 x half> @llvm.log.v2f16 (<2 x half> %0)
- // %res = call <2 x half> @llvm.sin.v2f16 (<2 x half> %1)
- // return <2 x half> %res
- if (dominates(Op0FinalMI, Op1IntialMI)) {
- canonicalizedMIFlow(DefSrc0Queue, DefSrc1Queue);
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
- } else if (dominates(Op1FinalMI, Op0IntialMI)) {
- canonicalizedMIFlow(DefSrc1Queue, DefSrc0Queue);
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
- } else {
- LLVM_DEBUG(dbgs() << "No Connecting MI exists" << '\n');
- continue;
+ MO.setReg(Candidate.FinalOutMO->getReg());
+ MO.setSubReg(Candidate.FinalOutMO->getSubReg());
+ }
+ LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in "
+ << *Candidate.PackMI << "With " << *Candidate.FinalOutMO
+ << '\n');
+
+ Candidate.PackMI->eraseFromParent();
+ ++Num16BitPackedInstructionsEliminated;
+
+ AMDGPU::SDWA::SdwaSel OpSel = Candidate.InitialSel;
+ int RemainingSecond = Candidate.SecondChainLength;
+
+ for (unsigned I = 0, E = Candidate.ChainOps.size() - 1; I < E; ++I) {
+ if (RemainingSecond == 0) {
+ if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
+ else
+ OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
}
- // Replace all use places of MI(v_pack) defMO with FinalOutMO.
- MachineOperand &DefMO = MI.getOperand(0);
- for (MachineOperand &MO :
- make_early_inc_range(MRI->use_nodbg_operands(DefMO.getReg()))) {
- if (!MO.isReg())
- continue;
-
- MO.setReg(FinalOutMO->getReg());
- MO.setSubReg(FinalOutMO->getSubReg());
+ MachineInstr *DefMI = Candidate.ChainOps[I]->getParent();
+ MachineOperand *SrcMO = Candidate.ChainOps[I + 1];
+ if (SrcMO->isDef()) {
+ assert(MRI->hasOneUse(SrcMO->getReg()));
+ SrcMO = findSingleRegUse(SrcMO, MRI);
+ assert(SrcMO && DefMI == SrcMO->getParent() &&
+ "the only use is not in DefMI");
}
- LLVM_DEBUG(dbgs() << "Replace uses of " << DefMO << " in " << MI << "With "
- << *FinalOutMO << '\n');
-
- // Delete v_pack machine instruction
- LLVM_DEBUG(dbgs() << "\nInstruction to be deleted : " << MI << "\n\n");
- MI.eraseFromParent();
- ++Num16BitPackedInstructionsEliminated;
-
- // Convert machine instruction into SDWA-version
- while (ChainedDefOps.size() != 1) {
- if (NumOfElemInSecondOpChain == 0) {
- if (OpSel == AMDGPU::SDWA::SdwaSel::WORD_0)
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_1;
- else
- OpSel = AMDGPU::SDWA::SdwaSel::WORD_0;
- }
-
- MachineInstr *DefMI = ChainedDefOps.front()->getParent();
- ChainedDefOps.pop();
- MachineOperand *SrcMO = ChainedDefOps.front();
-
- // Take SrcMO (which are def) as its usage in DefMI
- if (SrcMO->isDef()) {
- assert(MRI->hasOneUse(SrcMO->getReg()));
- SrcMO = findSingleRegUse(SrcMO, MRI);
- assert(DefMI == SrcMO->getParent() && "the only use is not in DefMI");
- }
- convertMIToSDWAWithOpsel(DefMI, *SrcMO, OpSel);
- NumOfElemInSecondOpChain--;
- }
+ convertMIToSDWAWithOpsel(Parent, DefMI, *SrcMO, OpSel);
+ --RemainingSecond;
}
+
+ return true;
}
bool SIPeepholeSDWA::run(MachineFunction &MF) {
@@ -1855,6 +1904,7 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
// Look for a possible ADD or SUB that resulted from a previously lowered
// V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
// lowers the pair of instructions into e32 form.
+ // Also, handles any V_PACK instructions that are encountered here.
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
@@ -1870,7 +1920,10 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
case AMDGPU::V_CNDMASK_B32_e64:
convertVcndmaskToVOP2(*PotentialMI, ST);
break;
- };
+ case AMDGPU::V_PACK_B32_F16_e64:
+ Operand->convertToSDWA(*PotentialMI, TII);
+ break;
+ }
}
SDWAOperands.clear();
@@ -1902,11 +1955,6 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
} while (Changed);
- // Process each v_pack_b32_fp16 instruction in MBB.
- eliminateFP16Packing(MBB, ST);
- Ret |= !ConvertedInstructions.empty();
- while (!ConvertedInstructions.empty())
- legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
}
return Ret;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
index 8243d5c1e21dd..74b5639d902a4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
@@ -106,10 +106,10 @@ define amdgpu_cs_chain void @high_sgpr_pressure(<30 x i32> inreg %sgpr, { i32, p
; GISEL-GFX12-NEXT: s_mov_b32 s34, retry_vgpr_alloc at abs32@lo
; GISEL-GFX12-NEXT: s_mov_b32 s35, retry_vgpr_alloc at abs32@hi
; GISEL-GFX12-NEXT: s_alloc_vgpr 64
-; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; GISEL-GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-GFX12-NEXT: s_cselect_b64 s[30:31], s[30:31], s[34:35]
; GISEL-GFX12-NEXT: s_cselect_b32 exec_lo, 7, -1
-; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; GISEL-GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
;
; DAGISEL-GFX12-LABEL: high_sgpr_pressure:
@@ -124,10 +124,10 @@ define amdgpu_cs_chain void @high_sgpr_pressure(<30 x i32> inreg %sgpr, { i32, p
; DAGISEL-GFX12-NEXT: s_mov_b32 s35, callee_high_sgpr at abs32@hi
; DAGISEL-GFX12-NEXT: s_mov_b32 s34, callee_high_sgpr at abs32@lo
; DAGISEL-GFX12-NEXT: s_alloc_vgpr 64
-; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; DAGISEL-GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-GFX12-NEXT: s_cselect_b64 s[34:35], s[34:35], s[30:31]
; DAGISEL-GFX12-NEXT: s_cselect_b32 exec_lo, 7, -1
-; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; DAGISEL-GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; DAGISEL-GFX12-NEXT: s_setpc_b64 s[34:35]
call void(ptr, i32, <30 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_high_sgpr, i32 7, <30 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 1, i32 inreg 64, i32 inreg -1, ptr @retry_vgpr_alloc)
unreachable
>From 5a83fc22716c8ef7b16ae178c0419bf908f58136 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 9 Dec 2025 07:34:46 +0000
Subject: [PATCH 10/11] Updated the comments & isSrcDestFP16Bits() to address
review.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 58 ++++++++++++-----------
1 file changed, 30 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 4d58456695031..95cdd19dd6f4d 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1459,30 +1459,30 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
static const DenseSet<unsigned> FP16BitOpcodes = {
- AMDGPU::V_CVT_F16_U16_e32, AMDGPU::V_CVT_F16_U16_e64,
- AMDGPU::V_CVT_F16_I16_e32, AMDGPU::V_CVT_F16_I16_e64,
- AMDGPU::V_RCP_F16_e64, AMDGPU::V_RCP_F16_e32,
- AMDGPU::V_RSQ_F16_e64, AMDGPU::V_RSQ_F16_e32,
- AMDGPU::V_SQRT_F16_e64, AMDGPU::V_SQRT_F16_e32,
- AMDGPU::V_LOG_F16_e64, AMDGPU::V_LOG_F16_e32,
- AMDGPU::V_EXP_F16_e64, AMDGPU::V_EXP_F16_e32,
- AMDGPU::V_SIN_F16_e64, AMDGPU::V_SIN_F16_e32,
- AMDGPU::V_COS_F16_e64, AMDGPU::V_COS_F16_e32,
- AMDGPU::V_FLOOR_F16_e64, AMDGPU::V_FLOOR_F16_e32,
- AMDGPU::V_CEIL_F16_e64, AMDGPU::V_CEIL_F16_e32,
- AMDGPU::V_TRUNC_F16_e64, AMDGPU::V_TRUNC_F16_e32,
- AMDGPU::V_RNDNE_F16_e64, AMDGPU::V_RNDNE_F16_e32,
- AMDGPU::V_FRACT_F16_e64, AMDGPU::V_FRACT_F16_e32,
- AMDGPU::V_FREXP_MANT_F16_e64, AMDGPU::V_FREXP_MANT_F16_e32,
+ // VOP1 FP16 unary operations
+ AMDGPU::V_CVT_F16_U16_e32, AMDGPU::V_CVT_F16_U16_e64,
+ AMDGPU::V_CVT_F16_I16_e32, AMDGPU::V_CVT_F16_I16_e64,
+ AMDGPU::V_RCP_F16_e64, AMDGPU::V_RCP_F16_e32, AMDGPU::V_RSQ_F16_e64,
+ AMDGPU::V_RSQ_F16_e32, AMDGPU::V_SQRT_F16_e64, AMDGPU::V_SQRT_F16_e32,
+ AMDGPU::V_LOG_F16_e64, AMDGPU::V_LOG_F16_e32, AMDGPU::V_EXP_F16_e64,
+ AMDGPU::V_EXP_F16_e32, AMDGPU::V_SIN_F16_e64, AMDGPU::V_SIN_F16_e32,
+ AMDGPU::V_COS_F16_e64, AMDGPU::V_COS_F16_e32, AMDGPU::V_FLOOR_F16_e64,
+ AMDGPU::V_FLOOR_F16_e32, AMDGPU::V_CEIL_F16_e64, AMDGPU::V_CEIL_F16_e32,
+ AMDGPU::V_TRUNC_F16_e64, AMDGPU::V_TRUNC_F16_e32, AMDGPU::V_RNDNE_F16_e64,
+ AMDGPU::V_RNDNE_F16_e32, AMDGPU::V_FRACT_F16_e64, AMDGPU::V_FRACT_F16_e32,
+ AMDGPU::V_FREXP_MANT_F16_e64, AMDGPU::V_FREXP_MANT_F16_e32,
AMDGPU::V_FREXP_EXP_I16_F16_e64, AMDGPU::V_FREXP_EXP_I16_F16_e32,
- AMDGPU::V_LDEXP_F16_e64, AMDGPU::V_LDEXP_F16_e32,
- AMDGPU::V_ADD_F16_e64, AMDGPU::V_ADD_F16_e32,
- AMDGPU::V_SUB_F16_e64, AMDGPU::V_SUB_F16_e32,
- AMDGPU::V_SUBREV_F16_e64, AMDGPU::V_SUBREV_F16_e32,
- AMDGPU::V_MUL_F16_e64, AMDGPU::V_MUL_F16_e32,
- AMDGPU::V_MAX_F16_e64, AMDGPU::V_MAX_F16_e32,
- AMDGPU::V_MIN_F16_e64, AMDGPU::V_MIN_F16_e32,
- AMDGPU::V_MAD_F16_e64, AMDGPU::V_FMA_F16_e64,
+ // VOP2 FP16 binary operations
+ AMDGPU::V_LDEXP_F16_e64, AMDGPU::V_LDEXP_F16_e32, AMDGPU::V_ADD_F16_e64,
+ AMDGPU::V_ADD_F16_e32, AMDGPU::V_SUB_F16_e64, AMDGPU::V_SUB_F16_e32,
+ AMDGPU::V_SUBREV_F16_e64, AMDGPU::V_SUBREV_F16_e32, AMDGPU::V_MUL_F16_e64,
+ AMDGPU::V_MUL_F16_e32, AMDGPU::V_MAX_F16_e64, AMDGPU::V_MAX_F16_e32,
+ AMDGPU::V_MIN_F16_e64, AMDGPU::V_MIN_F16_e32,
+ // VOP2 FP16 multiply-accumulate operations
+ AMDGPU::V_FMAC_F16_e64, AMDGPU::V_FMAC_F16_e32, AMDGPU::V_MAC_F16_e64,
+ AMDGPU::V_MAC_F16_e32,
+ // VOP3 FP16 ternary operations
+ AMDGPU::V_MAD_F16_e64, AMDGPU::V_FMA_F16_e64,
AMDGPU::V_DIV_FIXUP_F16_e64};
unsigned Opcode = MI->getOpcode();
@@ -1496,13 +1496,12 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
MachineInstr *Def1MI,
Register SrcRootReg,
const SIInstrInfo *TII) {
- // As if could, the Def1MI would have been sdwa-ed in order to access
- // upper half, and Def0MI should not be as it accessing lower half.
+ // The intended scenario is that Def1MI already reads the upper half of
+ // SrcRootReg through an SDWA instruction, while Def0MI still consumes the
+ // lower half of SrcRootReg in its non-SDWA form. Any other arrangement
+ // would violate the expected usage of SrcRootReg.
if (!TII->isSDWA(Def1MI->getOpcode()) || TII->isSDWA(Def0MI->getOpcode()))
return false;
-
- // Def1 should be writing into entire DWORD of dst, with unused part set
- // to zero-pad.
MachineOperand *Def1DstSel =
TII->getNamedOperand(*Def1MI, AMDGPU::OpName::dst_sel);
if (!Def1DstSel || Def1DstSel->getImm() != AMDGPU::SDWA::SdwaSel::DWORD)
@@ -1513,6 +1512,9 @@ static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
Def1DstUnused->getImm() != AMDGPU::SDWA::DstUnused::UNUSED_PAD)
return false;
+ // Helper to validate whether DefMI uses SrcRootReg as the specified source
+ // operand (SrcName), and if the corresponding SDWA selection operand
+ // (SrcSelName) matches the expected SdwaSel.
const auto CheckSrcSel = [&](MachineInstr *DefMI, AMDGPU::OpName SrcName,
AMDGPU::OpName SrcSelName,
AMDGPU::SDWA::SdwaSel SdwaSel) -> bool {
>From 1811c99174b9a896cb32817dae95910ae225c60c Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 9 Dec 2025 10:42:10 +0000
Subject: [PATCH 11/11] Added a TableGen-generated lookup table for
isSrcDestFP16Bits.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 32 +++----------------
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 13 ++++++++
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 ++
llvm/lib/Target/AMDGPU/VOPInstructions.td | 18 +++++++++++
4 files changed, 38 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 95cdd19dd6f4d..588bb3955cbb3 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -23,6 +23,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
@@ -1457,39 +1458,14 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
return SIPeepholeSDWA().run(MF);
}
+/// Returns true if MI has an FP16 destination and only 16-bit sources. The
+/// lookup is backed by TableGen's VOPSrcDestFP16Table (VOPInstructions.td).
static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
- static const DenseSet<unsigned> FP16BitOpcodes = {
- // VOP1 FP16 unary operations
- AMDGPU::V_CVT_F16_U16_e32, AMDGPU::V_CVT_F16_U16_e64,
- AMDGPU::V_CVT_F16_I16_e32, AMDGPU::V_CVT_F16_I16_e64,
- AMDGPU::V_RCP_F16_e64, AMDGPU::V_RCP_F16_e32, AMDGPU::V_RSQ_F16_e64,
- AMDGPU::V_RSQ_F16_e32, AMDGPU::V_SQRT_F16_e64, AMDGPU::V_SQRT_F16_e32,
- AMDGPU::V_LOG_F16_e64, AMDGPU::V_LOG_F16_e32, AMDGPU::V_EXP_F16_e64,
- AMDGPU::V_EXP_F16_e32, AMDGPU::V_SIN_F16_e64, AMDGPU::V_SIN_F16_e32,
- AMDGPU::V_COS_F16_e64, AMDGPU::V_COS_F16_e32, AMDGPU::V_FLOOR_F16_e64,
- AMDGPU::V_FLOOR_F16_e32, AMDGPU::V_CEIL_F16_e64, AMDGPU::V_CEIL_F16_e32,
- AMDGPU::V_TRUNC_F16_e64, AMDGPU::V_TRUNC_F16_e32, AMDGPU::V_RNDNE_F16_e64,
- AMDGPU::V_RNDNE_F16_e32, AMDGPU::V_FRACT_F16_e64, AMDGPU::V_FRACT_F16_e32,
- AMDGPU::V_FREXP_MANT_F16_e64, AMDGPU::V_FREXP_MANT_F16_e32,
- AMDGPU::V_FREXP_EXP_I16_F16_e64, AMDGPU::V_FREXP_EXP_I16_F16_e32,
- // VOP2 FP16 binary operations
- AMDGPU::V_LDEXP_F16_e64, AMDGPU::V_LDEXP_F16_e32, AMDGPU::V_ADD_F16_e64,
- AMDGPU::V_ADD_F16_e32, AMDGPU::V_SUB_F16_e64, AMDGPU::V_SUB_F16_e32,
- AMDGPU::V_SUBREV_F16_e64, AMDGPU::V_SUBREV_F16_e32, AMDGPU::V_MUL_F16_e64,
- AMDGPU::V_MUL_F16_e32, AMDGPU::V_MAX_F16_e64, AMDGPU::V_MAX_F16_e32,
- AMDGPU::V_MIN_F16_e64, AMDGPU::V_MIN_F16_e32,
- // VOP2 FP16 multiply-accumulate operations
- AMDGPU::V_FMAC_F16_e64, AMDGPU::V_FMAC_F16_e32, AMDGPU::V_MAC_F16_e64,
- AMDGPU::V_MAC_F16_e32,
- // VOP3 FP16 ternary operations
- AMDGPU::V_MAD_F16_e64, AMDGPU::V_FMA_F16_e64,
- AMDGPU::V_DIV_FIXUP_F16_e64};
-
unsigned Opcode = MI->getOpcode();
if (TII->isSDWA(Opcode))
Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
- return FP16BitOpcodes.contains(Opcode);
+ return AMDGPU::isSrcDestFP16Inst(Opcode);
}
static bool checkForRightSrcRootAccess(MachineInstr *Def0MI,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 90f0b49ab9a78..dbe3f41a51e71 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -410,6 +410,14 @@ struct VOPTrue16Info {
bool IsTrue16;
};
+struct VOPSrcDestFP16Info {
+ uint16_t Opcode;
+ bool IsSrcDestFP16;
+};
+
+#define GET_VOPSrcDestFP16Table_DECL
+#define GET_VOPSrcDestFP16Table_IMPL
+
#define GET_FP4FP8DstByteSelTable_DECL
#define GET_FP4FP8DstByteSelTable_IMPL
@@ -768,6 +776,11 @@ bool isTrue16Inst(unsigned Opc) {
return Info && Info->IsTrue16;
}
+bool isSrcDestFP16Inst(unsigned Opc) {
+ const VOPSrcDestFP16Info *Info = getSrcDestFP16OpcodeHelper(Opc);
+ return Info && Info->IsSrcDestFP16;
+}
+
FPType getFPDstSelType(unsigned Opc) {
const FP4FP8DstByteSelInfo *Info = getFP4FP8DstByteSelHelper(Opc);
if (!Info)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 3a352006e006c..6e5592d6682cd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -986,6 +986,9 @@ unsigned getTemporalHintType(const MCInstrDesc TID);
LLVM_READONLY
bool isTrue16Inst(unsigned Opc);
+LLVM_READONLY
+bool isSrcDestFP16Inst(unsigned Opc);
+
LLVM_READONLY
FPType getFPDstSelType(unsigned Opc);
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index ea3edb8ca6662..407351592c329 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -73,6 +73,15 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
bit IsTrue16 = P.IsTrue16;
VOPProfile Pfl = P;
+ // True if the destination is f16 and each source is either absent (untyped)
+ // or 16 bits wide. Used by the V_PACK_B32_F16 optimization in SIPeepholeSDWA.
+ bit IsSrcDestFP16 = !and(
+ !eq(P.DstVT, f16),
+ !or(!eq(P.Src0VT, untyped), !eq(P.Src0VT.Size, 16)),
+ !or(!eq(P.Src1VT, untyped), !eq(P.Src1VT.Size, 16)),
+ !or(!eq(P.Src2VT, untyped), !eq(P.Src2VT.Size, 16))
+ );
+
string AsmOperands;
}
@@ -2278,3 +2287,12 @@ def VOPTrue16Table : GenericTable {
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getTrue16OpcodeHelper";
}
+
+def VOPSrcDestFP16Table : GenericTable {
+ let FilterClass = "VOP_Pseudo";
+ let CppTypeName = "VOPSrcDestFP16Info";
+ let Fields = ["Opcode", "IsSrcDestFP16"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getSrcDestFP16OpcodeHelper";
+}
More information about the llvm-commits
mailing list