[llvm] [AMDGPU] Add pseudo scalar trans instructions for GFX12 (PR #75204)
Mirko Brkušanin via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 15 01:06:48 PST 2023
https://github.com/mbrkusanin updated https://github.com/llvm/llvm-project/pull/75204
From c36718f61817b42d39b9ef6f1b7e5366f6916963 Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin <Mirko.Brkusanin at amd.com>
Date: Tue, 12 Dec 2023 16:33:19 +0100
Subject: [PATCH 1/4] [AMDGPU][MC] Add pseudo scalar transcendental
instructions for GFX12
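
These are VOP3-encoded VALU transcendental instructions (TRANS = 1) whose
destination is an SGPR and whose source is scalar (an SGPR, inline constant,
or literal), hence "pseudo scalar". A representative pair of uses, as a
sketch (full operand and modifier coverage is in the MC tests below):

  v_s_rcp_f32 s5, s1          // SGPR source and destination
  v_s_sqrt_f16 s5, s1 mul:2   // f16 variant with an output modifier
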
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +
.../Disassembler/AMDGPUDisassembler.cpp | 1 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 +
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 37 ++
llvm/lib/Target/AMDGPU/VOPInstructions.td | 4 +
llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 570 ++++++++++++++++++
.../Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 570 ++++++++++++++++++
7 files changed, 1193 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 920cf784858768..91e0c86cd365c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -822,6 +822,12 @@ def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint",
"Has single-use VGPR hint instructions"
>;
+def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
+ "HasPseudoScalarTrans",
+ "true",
+ "Has Pseudo Scalar Transcendental instructions"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1467,6 +1473,7 @@ def FeatureISAVersion12 : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
+ FeaturePseudoScalarTrans,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;
@@ -2009,6 +2016,9 @@ def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">,
AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>;
+def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
+ AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;
+
def HasGDS : Predicate<"Subtarget->hasGDS()">;
def HasGWS : Predicate<"Subtarget->hasGWS()">;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 392bc626167cf6..ed2e7e4f189e01 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -209,6 +209,7 @@ DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)
DECODE_OPERAND_REG_7(SReg_32, OPW32)
+DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a316d608bf573d..a412602bf1bde0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -198,6 +198,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool ScalarizeGlobal = false;
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
+ bool HasPseudoScalarTrans = false;
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 685c9ac6a2be40..d7b11cc90ea5ab 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -840,6 +840,33 @@ let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
}
+class VOP_Pseudo_Scalar<RegisterClass Dst, RegisterOperand SrcOp,
+ ValueType dstVt, ValueType srcVt = dstVt>
+ : VOPProfile<[dstVt, srcVt, untyped, untyped]> {
+ let DstRC = VOPDstOperand<Dst>;
+ let Src0RC64 = SrcOp;
+
+ let HasOMod = 1;
+ let HasModifiers = 1;
+}
+
+def VOP_Pseudo_Scalar_F32 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f32, f32>;
+def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>;
+
+let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1,
+ isReMaterializable = 1 in {
+ defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_LOG_F16 : VOP3PseudoScalarInst<"v_s_log_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_RCP_F32 : VOP3PseudoScalarInst<"v_s_rcp_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_RCP_F16 : VOP3PseudoScalarInst<"v_s_rcp_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_RSQ_F32 : VOP3PseudoScalarInst<"v_s_rsq_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_RSQ_F16 : VOP3PseudoScalarInst<"v_s_rsq_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_SQRT_F32 : VOP3PseudoScalarInst<"v_s_sqrt_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_SQRT_F16 : VOP3PseudoScalarInst<"v_s_sqrt_f16", VOP_Pseudo_Scalar_F16>;
+}
+
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
@@ -906,6 +933,16 @@ defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26f>;
+defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>;
+defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>;
+defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>;
+defm V_S_LOG_F16 : VOP3Only_Real_Base_gfx12<0x283>;
+defm V_S_RCP_F32 : VOP3Only_Real_Base_gfx12<0x284>;
+defm V_S_RCP_F16 : VOP3Only_Real_Base_gfx12<0x285>;
+defm V_S_RSQ_F32 : VOP3Only_Real_Base_gfx12<0x286>;
+defm V_S_RSQ_F16 : VOP3Only_Real_Base_gfx12<0x287>;
+defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12<0x288>;
+defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12<0x289>;
defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12<0x2fe, "V_MAD_U64_U32", "v_mad_co_u64_u32">;
defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12<0x2ff, "V_MAD_I64_I32", "v_mad_co_i64_i32">;
defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 96184486e06f77..943a6770b4372f 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1303,6 +1303,10 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f
} // end SubtargetPredicate = isGFX11Plus
}
+multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P> {
+ def _e64 : VOP3_Pseudo<OpName, P>;
+}
+
//===----------------------------------------------------------------------===//
// VOP3 DPP
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index e97e4b8c7241c5..c7d66f416e9426 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -6611,3 +6611,573 @@ v_minimummaximum_f16 v5, 0.5, -m0, 0.5
v_minimummaximum_f16 v5, -src_scc, |vcc_lo|, -1
// GFX12: encoding: [0x05,0x02,0x6e,0xd6,0xfd,0xd4,0x04,0x23]
+
+v_s_exp_f32 s5, s1
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, s105
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, m0
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, null
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, -1
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_exp_f32 s105, 0xaf123456
+// GFX12: encoding: [0x69,0x00,0x80,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+
+v_s_exp_f32 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_exp_f32 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x80,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x80,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_exp_f32 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_exp_f32 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_exp_f32 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_exp_f16 s5, s1
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, s105
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, m0
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, null
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, -1
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_exp_f16 s105, 0xaf12
+// GFX12: encoding: [0x69,0x00,0x81,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+
+v_s_exp_f16 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_exp_f16 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x81,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x81,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_exp_f16 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_exp_f16 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_exp_f16 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_log_f32 s5, s1
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_log_f32 s5, s105
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_log_f32 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_log_f32 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_log_f32 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_log_f32 s5, m0
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_log_f32 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_log_f32 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_log_f32 s5, null
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_log_f32 s5, -1
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_log_f32 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_log_f32 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_log_f32 s105, 0xaf123456
+// GFX12: encoding: [0x69,0x00,0x82,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+
+v_s_log_f32 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_log_f32 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x82,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_log_f32 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x82,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_log_f32 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_log_f32 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_log_f32 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_log_f16 s5, s1
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_log_f16 s5, s105
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_log_f16 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_log_f16 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_log_f16 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_log_f16 s5, m0
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_log_f16 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_log_f16 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_log_f16 s5, null
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_log_f16 s5, -1
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_log_f16 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_log_f16 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_log_f16 s105, 0xaf12
+// GFX12: encoding: [0x69,0x00,0x83,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+
+v_s_log_f16 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_log_f16 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x83,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_log_f16 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x83,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_log_f16 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_log_f16 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_log_f16 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_rcp_f32 s5, s1
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, s105
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, m0
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, null
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, -1
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_rcp_f32 s105, 0xaf123456
+// GFX12: encoding: [0x69,0x00,0x84,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+
+v_s_rcp_f32 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_rcp_f32 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x84,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x84,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rcp_f32 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_rcp_f32 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_rcp_f32 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_rcp_f16 s5, s1
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, s105
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, m0
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, null
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, -1
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_rcp_f16 s105, 0xaf12
+// GFX12: encoding: [0x69,0x00,0x85,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+
+v_s_rcp_f16 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_rcp_f16 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x85,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x85,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rcp_f16 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_rcp_f16 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_rcp_f16 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_rsq_f32 s5, s1
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, s105
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, m0
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, null
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, -1
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_rsq_f32 s105, 0xaf123456
+// GFX12: encoding: [0x69,0x00,0x86,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+
+v_s_rsq_f32 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_rsq_f32 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x86,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x86,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rsq_f32 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_rsq_f32 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_rsq_f32 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_rsq_f16 s5, s1
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, s105
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, m0
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, null
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, -1
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_rsq_f16 s105, 0xaf12
+// GFX12: encoding: [0x69,0x00,0x87,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+
+v_s_rsq_f16 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_rsq_f16 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x87,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x87,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_rsq_f16 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_rsq_f16 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_rsq_f16 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_sqrt_f32 s5, s1
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, s105
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, m0
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, null
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, -1
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s105, 0xaf123456
+// GFX12: encoding: [0x69,0x00,0x88,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+
+v_s_sqrt_f32 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_sqrt_f32 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x88,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x88,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_sqrt_f32 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_sqrt_f32 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_sqrt_f32 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x18]
+
+v_s_sqrt_f16 s5, s1
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, s105
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x69,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, vcc_lo
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x6a,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, vcc_hi
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x6b,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, ttmp15
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x7b,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, m0
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x7d,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, exec_lo
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x7e,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, exec_hi
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x7f,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, null
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x7c,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, -1
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0xc1,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, 0.5
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0xf0,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, src_scc
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0xfd,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s105, 0xaf12
+// GFX12: encoding: [0x69,0x00,0x89,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+
+v_s_sqrt_f16 s5, -s1
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x20]
+
+v_s_sqrt_f16 s5, |s1|
+// GFX12: encoding: [0x05,0x01,0x89,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, s1 clamp
+// GFX12: encoding: [0x05,0x80,0x89,0xd6,0x01,0x00,0x00,0x00]
+
+v_s_sqrt_f16 s5, s1 mul:2
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x08]
+
+v_s_sqrt_f16 s5, s1 mul:4
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x10]
+
+v_s_sqrt_f16 s5, s1 div:2
+// GFX12: encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x18]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index 8ff8c2c4c4f6a1..bcaa05704a14ac 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -6124,3 +6124,573 @@
# GFX12: v_minimummaximum_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6e,0xd6,0x7e,0x82,0xad,0x01]
0x05,0x01,0x6e,0xd6,0x7e,0x82,0xad,0x01
+
+# GFX12: v_s_exp_f32 s5, s1 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, s105 ; encoding: [0x05,0x00,0x80,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x80,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x80,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x80,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, m0 ; encoding: [0x05,0x00,0x80,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, exec_lo ; encoding: [0x05,0x00,0x80,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, exec_hi ; encoding: [0x05,0x00,0x80,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, null ; encoding: [0x05,0x00,0x80,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, -1 ; encoding: [0x05,0x00,0x80,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, 0.5 ; encoding: [0x05,0x00,0x80,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, src_scc ; encoding: [0x05,0x00,0x80,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x80,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x80,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+0x69,0x00,0x80,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf
+
+# GFX12: v_s_exp_f32 s5, -s1 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_exp_f32 s5, |s1| ; encoding: [0x05,0x01,0x80,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x80,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x80,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x80,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_exp_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_exp_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x80,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_exp_f16 s5, s1 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, s105 ; encoding: [0x05,0x00,0x81,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x81,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x81,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x81,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, m0 ; encoding: [0x05,0x00,0x81,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, exec_lo ; encoding: [0x05,0x00,0x81,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, exec_hi ; encoding: [0x05,0x00,0x81,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, null ; encoding: [0x05,0x00,0x81,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, -1 ; encoding: [0x05,0x00,0x81,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, 0.5 ; encoding: [0x05,0x00,0x81,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, src_scc ; encoding: [0x05,0x00,0x81,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x81,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x81,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+0x69,0x00,0x81,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, -s1 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_exp_f16 s5, |s1| ; encoding: [0x05,0x01,0x81,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x81,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x81,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x81,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_exp_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_exp_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_exp_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x81,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_log_f32 s5, s1 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, s105 ; encoding: [0x05,0x00,0x82,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x82,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x82,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x82,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, m0 ; encoding: [0x05,0x00,0x82,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, exec_lo ; encoding: [0x05,0x00,0x82,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, exec_hi ; encoding: [0x05,0x00,0x82,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, null ; encoding: [0x05,0x00,0x82,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, -1 ; encoding: [0x05,0x00,0x82,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, 0.5 ; encoding: [0x05,0x00,0x82,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, src_scc ; encoding: [0x05,0x00,0x82,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x82,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x82,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+0x69,0x00,0x82,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf
+
+# GFX12: v_s_log_f32 s5, -s1 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_log_f32 s5, |s1| ; encoding: [0x05,0x01,0x82,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x82,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x82,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x82,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_log_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_log_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_log_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x82,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_log_f16 s5, s1 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, s105 ; encoding: [0x05,0x00,0x83,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x83,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x83,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x83,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, m0 ; encoding: [0x05,0x00,0x83,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, exec_lo ; encoding: [0x05,0x00,0x83,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, exec_hi ; encoding: [0x05,0x00,0x83,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, null ; encoding: [0x05,0x00,0x83,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, -1 ; encoding: [0x05,0x00,0x83,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, 0.5 ; encoding: [0x05,0x00,0x83,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, src_scc ; encoding: [0x05,0x00,0x83,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x83,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x83,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+0x69,0x00,0x83,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, -s1 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_log_f16 s5, |s1| ; encoding: [0x05,0x01,0x83,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x83,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x83,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x83,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_log_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_log_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_log_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x83,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_rcp_f32 s5, s1 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, s105 ; encoding: [0x05,0x00,0x84,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x84,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x84,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x84,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, m0 ; encoding: [0x05,0x00,0x84,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, exec_lo ; encoding: [0x05,0x00,0x84,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, exec_hi ; encoding: [0x05,0x00,0x84,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, null ; encoding: [0x05,0x00,0x84,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, -1 ; encoding: [0x05,0x00,0x84,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, 0.5 ; encoding: [0x05,0x00,0x84,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, src_scc ; encoding: [0x05,0x00,0x84,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x84,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x84,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+0x69,0x00,0x84,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf
+
+# GFX12: v_s_rcp_f32 s5, -s1 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_rcp_f32 s5, |s1| ; encoding: [0x05,0x01,0x84,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x84,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x84,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x84,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_rcp_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_rcp_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x84,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_rcp_f16 s5, s1 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, s105 ; encoding: [0x05,0x00,0x85,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x85,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x85,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x85,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, m0 ; encoding: [0x05,0x00,0x85,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, exec_lo ; encoding: [0x05,0x00,0x85,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, exec_hi ; encoding: [0x05,0x00,0x85,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, null ; encoding: [0x05,0x00,0x85,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, -1 ; encoding: [0x05,0x00,0x85,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, 0.5 ; encoding: [0x05,0x00,0x85,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, src_scc ; encoding: [0x05,0x00,0x85,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x85,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x85,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+0x69,0x00,0x85,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, -s1 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_rcp_f16 s5, |s1| ; encoding: [0x05,0x01,0x85,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x85,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x85,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x85,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rcp_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_rcp_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_rcp_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x85,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_rsq_f32 s5, s1 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, s105 ; encoding: [0x05,0x00,0x86,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x86,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x86,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x86,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, m0 ; encoding: [0x05,0x00,0x86,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, exec_lo ; encoding: [0x05,0x00,0x86,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, exec_hi ; encoding: [0x05,0x00,0x86,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, null ; encoding: [0x05,0x00,0x86,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, -1 ; encoding: [0x05,0x00,0x86,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, 0.5 ; encoding: [0x05,0x00,0x86,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, src_scc ; encoding: [0x05,0x00,0x86,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x86,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x86,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+0x69,0x00,0x86,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf
+
+# GFX12: v_s_rsq_f32 s5, -s1 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_rsq_f32 s5, |s1| ; encoding: [0x05,0x01,0x86,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x86,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x86,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x86,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_rsq_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_rsq_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x86,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_rsq_f16 s5, s1 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, s105 ; encoding: [0x05,0x00,0x87,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x87,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x87,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x87,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, m0 ; encoding: [0x05,0x00,0x87,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, exec_lo ; encoding: [0x05,0x00,0x87,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, exec_hi ; encoding: [0x05,0x00,0x87,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, null ; encoding: [0x05,0x00,0x87,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, -1 ; encoding: [0x05,0x00,0x87,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, 0.5 ; encoding: [0x05,0x00,0x87,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, src_scc ; encoding: [0x05,0x00,0x87,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x87,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x87,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+0x69,0x00,0x87,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, -s1 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_rsq_f16 s5, |s1| ; encoding: [0x05,0x01,0x87,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x87,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x87,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x87,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_rsq_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_rsq_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_rsq_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x87,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_sqrt_f32 s5, s1 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, s105 ; encoding: [0x05,0x00,0x88,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, vcc_lo ; encoding: [0x05,0x00,0x88,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, vcc_hi ; encoding: [0x05,0x00,0x88,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, ttmp15 ; encoding: [0x05,0x00,0x88,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, m0 ; encoding: [0x05,0x00,0x88,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, exec_lo ; encoding: [0x05,0x00,0x88,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, exec_hi ; encoding: [0x05,0x00,0x88,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, null ; encoding: [0x05,0x00,0x88,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, -1 ; encoding: [0x05,0x00,0x88,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, 0.5 ; encoding: [0x05,0x00,0x88,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, src_scc ; encoding: [0x05,0x00,0x88,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x88,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s105, 0xaf123456 ; encoding: [0x69,0x00,0x88,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
+0x69,0x00,0x88,0xd6,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf
+
+# GFX12: v_s_sqrt_f32 s5, -s1 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_sqrt_f32 s5, |s1| ; encoding: [0x05,0x01,0x88,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x88,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, s1 clamp ; encoding: [0x05,0x80,0x88,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x88,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f32 s5, s1 mul:2 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_sqrt_f32 s5, s1 mul:4 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_sqrt_f32 s5, s1 div:2 ; encoding: [0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x88,0xd6,0x01,0x00,0x00,0x18
+
+# GFX12: v_s_sqrt_f16 s5, s1 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, s105 ; encoding: [0x05,0x00,0x89,0xd6,0x69,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x69,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, vcc_lo ; encoding: [0x05,0x00,0x89,0xd6,0x6a,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x6a,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, vcc_hi ; encoding: [0x05,0x00,0x89,0xd6,0x6b,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x6b,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, ttmp15 ; encoding: [0x05,0x00,0x89,0xd6,0x7b,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x7b,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, m0 ; encoding: [0x05,0x00,0x89,0xd6,0x7d,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x7d,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, exec_lo ; encoding: [0x05,0x00,0x89,0xd6,0x7e,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x7e,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, exec_hi ; encoding: [0x05,0x00,0x89,0xd6,0x7f,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x7f,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, null ; encoding: [0x05,0x00,0x89,0xd6,0x7c,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0x7c,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, -1 ; encoding: [0x05,0x00,0x89,0xd6,0xc1,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0xc1,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, 0.5 ; encoding: [0x05,0x00,0x89,0xd6,0xf0,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0xf0,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, src_scc ; encoding: [0x05,0x00,0x89,0xd6,0xfd,0x00,0x00,0x00]
+0x05,0x00,0x89,0xd6,0xfd,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s105, 0xaf12 ; encoding: [0x69,0x00,0x89,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00]
+0x69,0x00,0x89,0xd6,0xff,0x00,0x00,0x00,0x12,0xaf,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, -s1 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x20]
+0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x20
+
+# GFX12: v_s_sqrt_f16 s5, |s1| ; encoding: [0x05,0x01,0x89,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x01,0x89,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, s1 clamp ; encoding: [0x05,0x80,0x89,0xd6,0x01,0x00,0x00,0x00]
+0x05,0x80,0x89,0xd6,0x01,0x00,0x00,0x00
+
+# GFX12: v_s_sqrt_f16 s5, s1 mul:2 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x08]
+0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x08
+
+# GFX12: v_s_sqrt_f16 s5, s1 mul:4 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x10]
+0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x10
+
+# GFX12: v_s_sqrt_f16 s5, s1 div:2 ; encoding: [0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x18]
+0x05,0x00,0x89,0xd6,0x01,0x00,0x00,0x18
From 12b824f87ff58b679228f9858db8fdb0fb3848e7 Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin <Mirko.Brkusanin at amd.com>
Date: Tue, 12 Dec 2023 16:36:06 +0100
Subject: [PATCH 2/4] [AMDGPU][MCA] Scheduler updates for pseudo scalar
transcendental instructions
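
GFX12 gets its own scheduling model, GFX12SpeedModel, plus a new
WritePseudoScalarTrans class that models these instructions with a 7-cycle
latency; all older models mark the class unsupported. The llvm-mca test
below pins the resulting timeline. The same numbers can be reproduced by
hand (assuming a build with the AMDGPU target enabled; trans.s is a
placeholder file name):

  llvm-mca -mtriple=amdgcn -mcpu=gfx1200 --timeline trans.s
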
---
llvm/lib/Target/AMDGPU/GCNProcessors.td | 4 +-
llvm/lib/Target/AMDGPU/SISchedule.td | 36 ++++++
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 2 +-
.../AMDGPU/gfx12-pseudo-scalar-trans.s | 103 ++++++++++++++++++
4 files changed, 142 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx12-pseudo-scalar-trans.s
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 80669c04f2c688..96af1a6aab3da7 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -284,10 +284,10 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel,
// GCN GFX12.
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"gfx1200", GFX11SpeedModel,
+def : ProcessorModel<"gfx1200", GFX12SpeedModel,
FeatureISAVersion12.Features
>;
-def : ProcessorModel<"gfx1201", GFX11SpeedModel,
+def : ProcessorModel<"gfx1201", GFX12SpeedModel,
FeatureISAVersion12.Features
>;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index c67e647a7e7c70..b0e8e4112254d8 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -68,6 +68,9 @@ def Write8PassDGEMM : SchedWrite;
// Scalar float instructions
def WriteSFPU : SchedWrite;
+// F16 or F32 pseudo scalar transcendental instructions
+def WritePseudoScalarTrans : SchedWrite;
+
// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)
@@ -93,6 +96,7 @@ def SIDPFullSpeedModel : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
+def GFX12SpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
@@ -174,6 +178,7 @@ multiclass SICommonWriteRes {
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
def : UnsupportedWriteRes<WriteSFPU>;
+ def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1
def : ReadAdvance<MIVGPRRead, -2>;
@@ -318,6 +323,7 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
def : UnsupportedWriteRes<WriteSFPU>;
+def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -351,6 +357,36 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
} // End RetireOOO = 1
+def : UnsupportedWriteRes<WritePseudoScalarTrans>;
+
def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = GFX11SpeedModel
+
+let SchedModel = GFX12SpeedModel in {
+
+def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>;
+def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC], 7>;
+
+def : HWWriteRes<WriteBranch, [HWBranch], 32>;
+def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
+def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
+def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>;
+def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
+def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = GFX12SpeedModel
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index d7b11cc90ea5ab..5fda10b73a0c02 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -854,7 +854,7 @@ def VOP_Pseudo_Scalar_F32 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f32, f32>;
def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>;
let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1,
- isReMaterializable = 1 in {
+ isReMaterializable = 1, SchedRW = [WritePseudoScalarTrans] in {
defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32>;
defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32>;
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx12-pseudo-scalar-trans.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx12-pseudo-scalar-trans.s
new file mode 100644
index 00000000000000..d11b61919b3ce6
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx12-pseudo-scalar-trans.s
@@ -0,0 +1,103 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx1200 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
+
+v_s_exp_f32 s0, s0
+v_s_log_f32 s0, s0
+v_s_rcp_f32 s1, s1
+v_s_rsq_f32 s1, s0
+v_s_sqrt_f32 s2, s1
+v_s_exp_f16 s3, s1
+v_s_log_f16 s4, s1
+v_s_rcp_f16 s5, s2
+v_s_rsq_f16 s5, s4
+v_s_sqrt_f16 s5, s5
+
+# CHECK: Iterations: 1
+# CHECK-NEXT: Instructions: 10
+# CHECK-NEXT: Total Cycles: 45
+# CHECK-NEXT: Total uOps: 10
+
+# CHECK: Dispatch Width: 1
+# CHECK-NEXT: uOps Per Cycle: 0.22
+# CHECK-NEXT: IPC: 0.22
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 7 1.00 U v_s_exp_f32 s0, s0
+# CHECK-NEXT: 1 7 1.00 U v_s_log_f32 s0, s0
+# CHECK-NEXT: 1 7 1.00 U v_s_rcp_f32 s1, s1
+# CHECK-NEXT: 1 7 1.00 U v_s_rsq_f32 s1, s0
+# CHECK-NEXT: 1 7 1.00 U v_s_sqrt_f32 s2, s1
+# CHECK-NEXT: 1 7 1.00 U v_s_exp_f16 s3, s1
+# CHECK-NEXT: 1 7 1.00 U v_s_log_f16 s4, s1
+# CHECK-NEXT: 1 7 1.00 U v_s_rcp_f16 s5, s2
+# CHECK-NEXT: 1 7 1.00 U v_s_rsq_f16 s5, s4
+# CHECK-NEXT: 1 7 1.00 U v_s_sqrt_f16 s5, s5
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - HWBranch
+# CHECK-NEXT: [1] - HWExport
+# CHECK-NEXT: [2] - HWLGKM
+# CHECK-NEXT: [3] - HWRC
+# CHECK-NEXT: [4] - HWSALU
+# CHECK-NEXT: [5] - HWVALU
+# CHECK-NEXT: [6] - HWVMEM
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6]
+# CHECK-NEXT: - - - 10.00 - 10.00 -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_exp_f32 s0, s0
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_log_f32 s0, s0
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_rcp_f32 s1, s1
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_rsq_f32 s1, s0
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_sqrt_f32 s2, s1
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_exp_f16 s3, s1
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_log_f16 s4, s1
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_rcp_f16 s5, s2
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_rsq_f16 s5, s4
+# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_sqrt_f16 s5, s5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 01234
+
+# CHECK: [0,0] DeeeeeeE . . . . . . . . v_s_exp_f32 s0, s0
+# CHECK-NEXT: [0,1] . . DeeeeeeE. . . . . . . v_s_log_f32 s0, s0
+# CHECK-NEXT: [0,2] . . DeeeeeeE . . . . . . v_s_rcp_f32 s1, s1
+# CHECK-NEXT: [0,3] . . . DeeeeeeE . . . . . v_s_rsq_f32 s1, s0
+# CHECK-NEXT: [0,4] . . . . .DeeeeeeE . . . . v_s_sqrt_f32 s2, s1
+# CHECK-NEXT: [0,5] . . . . . DeeeeeeE. . . . v_s_exp_f16 s3, s1
+# CHECK-NEXT: [0,6] . . . . . DeeeeeeE . . . v_s_log_f16 s4, s1
+# CHECK-NEXT: [0,7] . . . . . . DeeeeeeE . . v_s_rcp_f16 s5, s2
+# CHECK-NEXT: [0,8] . . . . . . DeeeeeeE . . v_s_rsq_f16 s5, s4
+# CHECK-NEXT: [0,9] . . . . . . . . DeeeeeeE v_s_sqrt_f16 s5, s5
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 0.0 0.0 0.0 v_s_exp_f32 s0, s0
+# CHECK-NEXT: 1. 1 0.0 0.0 0.0 v_s_log_f32 s0, s0
+# CHECK-NEXT: 2. 1 0.0 0.0 0.0 v_s_rcp_f32 s1, s1
+# CHECK-NEXT: 3. 1 0.0 0.0 0.0 v_s_rsq_f32 s1, s0
+# CHECK-NEXT: 4. 1 0.0 0.0 0.0 v_s_sqrt_f32 s2, s1
+# CHECK-NEXT: 5. 1 0.0 0.0 0.0 v_s_exp_f16 s3, s1
+# CHECK-NEXT: 6. 1 0.0 0.0 0.0 v_s_log_f16 s4, s1
+# CHECK-NEXT: 7. 1 0.0 0.0 0.0 v_s_rcp_f16 s5, s2
+# CHECK-NEXT: 8. 1 0.0 0.0 0.0 v_s_rsq_f16 s5, s4
+# CHECK-NEXT: 9. 1 0.0 0.0 0.0 v_s_sqrt_f16 s5, s5
+# CHECK-NEXT: 1 0.0 0.0 0.0 <total>
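
For reference, the summary numbers above follow from the scheduling entry: ten instructions retiring over 45 total cycles gives IPC = 10 / 45 ≈ 0.22, and with each instruction occupying HWVALU and HWRC for one cycle apiece the block reciprocal throughput is 10 x 1.00 = 10.0.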
>From beceb7528c1071e08df7039b838af0753ee3fc21 Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin <Mirko.Brkusanin at amd.com>
Date: Tue, 12 Dec 2023 16:39:32 +0100
Subject: [PATCH 3/4] [AMDGPU] Codegen for pseudo scalar transcendental
instructions
---
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 28 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +
llvm/lib/Target/AMDGPU/SOPInstructions.td | 34 +-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 26 +-
llvm/lib/Target/AMDGPU/VOPInstructions.td | 13 +-
...st-select-pseudo-scalar-transcendental.mir | 261 +++++++++++++
.../GlobalISel/llvm.amdgcn.rsq.clamp.ll | 6 +-
...ankselect-pseudo-scalar-transcendental.mir | 243 ++++++++++++
.../AMDGPU/pseudo-scalar-transcendental.ll | 357 ++++++++++++++++++
9 files changed, 940 insertions(+), 30 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pseudo-scalar-transcendental.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-pseudo-scalar-transcendental.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d0c1302c3f003c..f0f82e33ad0c8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3781,14 +3781,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
+ case AMDGPU::G_FSQRT:
+ case AMDGPU::G_FEXP2:
+ case AMDGPU::G_FLOG2: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
case AMDGPU::G_SSUBSAT:
case AMDGPU::G_UADDSAT:
case AMDGPU::G_USUBSAT:
case AMDGPU::G_FMAD:
- case AMDGPU::G_FSQRT:
- case AMDGPU::G_FEXP2:
- case AMDGPU::G_FLOG2:
case AMDGPU::G_FLDEXP:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
@@ -4253,12 +4259,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_log_clamp:
- case Intrinsic::amdgcn_log:
- case Intrinsic::amdgcn_exp2:
- case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
- case Intrinsic::amdgcn_sqrt:
- case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_fmul_legacy:
@@ -4315,6 +4316,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_sqrt: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:
if (isSALUMapping(MI))
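
The net effect of the two hunks above: for these opcodes and intrinsics, a uniform 16- or 32-bit operation now keeps the SALU (SGPR-bank) mapping instead of being forced onto VGPRs. A hedged sketch of IR that should take the new path (it mirrors the v_s_rcp_f32 case in pseudo-scalar-transcendental.ll later in this patch):

; Uniform (inreg) source: expected to stay on the SGPR bank and select
; v_s_rcp_f32 on gfx1200; a VGPR source keeps the default VOP mapping.
define amdgpu_cs float @rcp_uniform(float inreg %x) {
  %r = call fast float @llvm.amdgcn.rcp.f32(float %x)
  ret float %r
}
declare float @llvm.amdgcn.rcp.f32(float)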
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a412602bf1bde0..a675da8da33989 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1161,6 +1161,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
+ bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index c51534cdbd3054..0c428d7066f10d 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -675,19 +675,8 @@ let SubtargetPredicate = isGFX12Plus in {
} // End SubtargetPredicate = isGFX12Plus
-def SelectPat : PatFrag <
- (ops node:$src1, node:$src2),
- (select SCC, $src1, $src2),
- [{ return !N->isDivergent(); }]
->;
-
let Uses = [SCC] in {
- let AddedComplexity = 20 in {
- def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
- [(set i32:$sdst, (SelectPat i32:$src0, i32:$src1))]
- >;
- }
-
+ def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]
@@ -1808,6 +1797,27 @@ def : GetFPModePat<fpmode_mask_gfx6plus>;
// SOP2 Patterns
//===----------------------------------------------------------------------===//
+def UniformSelect : PatFrag<
+ (ops node:$src0, node:$src1),
+ (select SCC, $src0, $src1),
+ [{ return !N->isDivergent(); }]
+>;
+
+let AddedComplexity = 20 in {
+ def : GCNPat<
+ (i32 (UniformSelect i32:$src0, i32:$src1)),
+ (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
+ >;
+
+ // TODO: The predicate should not be necessary, but enabling this pattern for
+ // all subtargets generates worse code in some cases.
+ let OtherPredicates = [HasPseudoScalarTrans] in
+ def : GCNPat<
+ (f32 (UniformSelect f32:$src0, f32:$src1)),
+ (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
+ >;
+}
+
// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
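
The change above moves the uniform-select pattern off the S_CSELECT_B32 definition and into standalone GCNPats, and the new f32 variant lets uniform float selects stay scalar. A hedged sketch (function name illustrative) of IR that should now lower to s_cmp plus s_cselect_b32 on gfx1200; the v_s_exp_f32 codegen test later in this series shows the same s_cselect_b32-on-float shape:

define amdgpu_cs float @uniform_fselect(i32 inreg %c, float inreg %a, float inreg %b) {
  %cc = icmp eq i32 %c, 0
  %sel = select i1 %cc, float %a, float %b
  ret float %sel
}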
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 5fda10b73a0c02..170f1f0911bfda 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -855,18 +855,34 @@ def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>
let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1,
isReMaterializable = 1, SchedRW = [WritePseudoScalarTrans] in {
- defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32, AMDGPUexp>;
defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>;
- defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32, AMDGPUlog>;
defm V_S_LOG_F16 : VOP3PseudoScalarInst<"v_s_log_f16", VOP_Pseudo_Scalar_F16>;
- defm V_S_RCP_F32 : VOP3PseudoScalarInst<"v_s_rcp_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_RCP_F32 : VOP3PseudoScalarInst<"v_s_rcp_f32", VOP_Pseudo_Scalar_F32, AMDGPUrcp>;
defm V_S_RCP_F16 : VOP3PseudoScalarInst<"v_s_rcp_f16", VOP_Pseudo_Scalar_F16>;
- defm V_S_RSQ_F32 : VOP3PseudoScalarInst<"v_s_rsq_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_RSQ_F32 : VOP3PseudoScalarInst<"v_s_rsq_f32", VOP_Pseudo_Scalar_F32, AMDGPUrsq>;
defm V_S_RSQ_F16 : VOP3PseudoScalarInst<"v_s_rsq_f16", VOP_Pseudo_Scalar_F16>;
- defm V_S_SQRT_F32 : VOP3PseudoScalarInst<"v_s_sqrt_f32", VOP_Pseudo_Scalar_F32>;
+ defm V_S_SQRT_F32 : VOP3PseudoScalarInst<"v_s_sqrt_f32", VOP_Pseudo_Scalar_F32, any_amdgcn_sqrt>;
defm V_S_SQRT_F16 : VOP3PseudoScalarInst<"v_s_sqrt_f16", VOP_Pseudo_Scalar_F16>;
}
+class PseudoScalarPatF16<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat <
+ (f16 (UniformUnaryFrag<node> (f16 (VOP3Mods0 f16:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)))),
+ (f16 (COPY_TO_REGCLASS (f32 (inst i32:$src0_modifiers, f16:$src0, i1:$clamp,
+ i32:$omod)),
+ SReg_32_XEXEC))
+>;
+
+let SubtargetPredicate = HasPseudoScalarTrans in {
+ def : PseudoScalarPatF16<AMDGPUexpf16, V_S_EXP_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUlogf16, V_S_LOG_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUrcp, V_S_RCP_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUrsq, V_S_RSQ_F16_e64>;
+ def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
+}
+
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
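
PseudoScalarPatF16 is needed because the f16 pseudos are declared with an f32 destination type (VOP_Pseudo_Scalar_F16 maps an f16 source to an f32 result), so the pattern must wrap the instruction result in a COPY_TO_REGCLASS back to SReg_32_XEXEC at f16 type. A minimal sketch of IR that should hit one of these patterns, mirroring v_s_rcp_f16 in the codegen tests below:

define amdgpu_cs half @uniform_rcp_f16(half inreg %x) {
  %r = call fast half @llvm.amdgcn.rcp.f16(half %x)
  ret half %r
}
declare half @llvm.amdgcn.rcp.f16(half)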
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 943a6770b4372f..fd4626d902acea 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1303,8 +1303,17 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f
} // end SubtargetPredicate = isGFX11Plus
}
-multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P> {
- def _e64 : VOP3_Pseudo<OpName, P>;
+class UniformUnaryFragOrOp<SDPatternOperator Op> {
+ SDPatternOperator ret = !if(!or(!isa<SDNode>(Op), !isa<PatFrags>(Op)),
+ UniformUnaryFrag<Op>, Op);
+}
+
+multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag> {
+ def _e64 : VOP3_Pseudo<OpName, P, [(set P.DstVT:$vdst,
+ (UniformUnaryFragOrOp<node>.ret
+ (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp,
+ i32:$omod))))]>;
}
//===----------------------------------------------------------------------===//
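
UniformUnaryFragOrOp wraps a plain SDNode or PatFrags operator in UniformUnaryFrag, so the generated pattern only matches non-divergent nodes; any other operator (notably the null_frag default) is passed through unchanged and yields no pattern. The flip side of the uniform examples above, as a hedged sketch: with a divergent source the same intrinsic should keep selecting the ordinary VALU opcode.

; Divergent source (plain VGPR argument): expected to select v_exp_f32,
; not v_s_exp_f32, on gfx1200.
define amdgpu_cs float @divergent_exp(float %x) {
  %r = call float @llvm.amdgcn.exp2.f32(float %x)
  ret float %r
}
declare float @llvm.amdgcn.exp2.f32(float)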
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pseudo-scalar-transcendental.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pseudo-scalar-transcendental.mir
new file mode 100644
index 00000000000000..92cffb1b47bd1c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pseudo-scalar-transcendental.mir
@@ -0,0 +1,261 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: v_s_exp_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_exp_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_EXP_F32_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_EXP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_S_EXP_F32_e64_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), %0
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_exp_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_exp_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_EXP_F16_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_EXP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_S_EXP_F16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), %1
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_log_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_log_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_LOG_F32_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_LOG_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_S_LOG_F32_e64_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), %0
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_log_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_log_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_LOG_F16_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_S_LOG_F16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.log), %1
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_rcp_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_rcp_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_RCP_F32_e64_:%[0-9]+]]:sreg_32_xexec = nnan ninf nsz arcp contract afn reassoc nofpexcept V_S_RCP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_S_RCP_F32_e64_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = nnan ninf nsz arcp contract afn reassoc G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0(s32)
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_rcp_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_rcp_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_RCP_F16_e64_:%[0-9]+]]:sreg_32_xexec = nnan ninf nsz arcp contract afn reassoc nofpexcept V_S_RCP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_S_RCP_F16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = nnan ninf nsz arcp contract afn reassoc G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1(s16)
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_rsq_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_rsq_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_RSQ_F32_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_RSQ_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_S_RSQ_F32_e64_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %0(s32)
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_rsq_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_rsq_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_RSQ_F16_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_RSQ_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_S_RSQ_F16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %1(s16)
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_sqrt_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_sqrt_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_SQRT_F32_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_SQRT_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_S_SQRT_F32_e64_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_FSQRT %0
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_sqrt_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_sqrt_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_SQRT_F16_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_SQRT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_S_SQRT_F16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_FSQRT %1
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_amdgcn_sqrt_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_amdgcn_sqrt_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_SQRT_F32_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_SQRT_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_S_SQRT_F32_e64_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %0(s32)
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_amdgcn_sqrt_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_amdgcn_sqrt_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_SQRT_F16_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_SQRT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_S_SQRT_F16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %1(s16)
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index ed298796937c71..f495ce5390c727 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -145,10 +145,10 @@ define float @v_rsq_clamp_undef_f32() #0 {
; GFX12-LABEL: v_rsq_clamp_undef_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT: v_rsq_f32_e32 v0, s0
-; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
+; GFX12-NEXT: v_s_rsq_f32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
+; GFX12-NEXT: v_minmax_num_f32 v0, s0, 0x7f7fffff, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
ret float %rsq_clamp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-pseudo-scalar-transcendental.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-pseudo-scalar-transcendental.mir
new file mode 100644
index 00000000000000..8b8c539ec3a1af
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-pseudo-scalar-transcendental.mir
@@ -0,0 +1,243 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: v_s_exp_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_exp_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[FEXP2_:%[0-9]+]]:sgpr(s32) = G_FEXP2 [[COPY]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[FEXP2_]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_FEXP2 %0
+ $vgpr0 = COPY %1(s32)
+...
+---
+name: v_s_exp_f16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_exp_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[FEXP2_:%[0-9]+]]:sgpr(s16) = G_FEXP2 [[TRUNC]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[FEXP2_]](s16)
+ ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s16) = G_FEXP2 %1
+ %3:_(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_log_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_log_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:sgpr(s32) = G_FLOG2 [[COPY]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[FLOG2_]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_FLOG2 %0
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_log_f16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_log_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[FLOG2_:%[0-9]+]]:sgpr(s16) = G_FLOG2 [[TRUNC]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[FLOG2_]](s16)
+ ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s16) = G_FLOG2 %1
+ %3:_(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_rcp_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_rcp_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = nnan ninf nsz arcp contract afn reassoc G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = nnan ninf nsz arcp contract afn reassoc G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0(s32)
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_rcp_f16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_rcp_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s16) = nnan ninf nsz arcp contract afn reassoc G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[INT]](s16)
+ ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s16) = nnan ninf nsz arcp contract afn reassoc G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1(s16)
+ %3:_(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_rsq_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_rsq_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %0(s32)
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_rsq_f16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_rsq_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[TRUNC]](s16)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[INT]](s16)
+ ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %1(s16)
+ %3:_(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_sqrt_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_sqrt_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[FSQRT:%[0-9]+]]:sgpr(s32) = G_FSQRT [[COPY]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_FSQRT %0
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_sqrt_f16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_sqrt_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[FSQRT:%[0-9]+]]:sgpr(s16) = G_FSQRT [[TRUNC]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[FSQRT]](s16)
+ ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s16) = G_FSQRT %1
+ %3:_(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_amdgcn_sqrt_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_amdgcn_sqrt_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[COPY]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %0(s32)
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_amdgcn_sqrt_f16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_amdgcn_sqrt_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[TRUNC]](s16)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[INT]](s16)
+ ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %1(s16)
+ %3:_(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
new file mode 100644
index 00000000000000..5b0e0d74009f0b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -0,0 +1,357 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+
+define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
+; GFX12-LABEL: v_s_exp_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-NEXT: s_cselect_b32 s1, 0x42800000, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT: s_add_f32 s0, s0, s1
+; GFX12-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
+; GFX12-NEXT: v_s_exp_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call float @llvm.exp2.f32(float %src)
+ ret float %result
+}
+
+define amdgpu_cs half @v_s_exp_f16(half inreg %src) {
+; GFX12-LABEL: v_s_exp_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_exp_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call half @llvm.exp2.f16(half %src)
+ ret half %result
+}
+
+define amdgpu_cs float @v_s_amdgcn_exp_f32(float inreg %src) {
+; GFX12-LABEL: v_s_amdgcn_exp_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_exp_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call float @llvm.amdgcn.exp2.f32(float %src)
+ ret float %result
+}
+
+define amdgpu_cs half @v_s_amdgcn_exp_f16(half inreg %src) {
+; GFX12-LABEL: v_s_amdgcn_exp_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_exp_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call half @llvm.amdgcn.exp2.f16(half %src)
+ ret half %result
+}
+
+define amdgpu_cs float @v_s_log_f32(float inreg %src) {
+; GFX12-LABEL: v_s_log_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000
+; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-NEXT: v_s_log_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call float @llvm.log2.f32(float %src)
+ ret float %result
+}
+
+define amdgpu_cs half @v_s_log_f16(half inreg %src) {
+; GFX12-LABEL: v_s_log_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_log_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call half @llvm.log2.f16(half %src)
+ ret half %result
+}
+
+define amdgpu_cs float @v_s_amdgcn_log_f32(float inreg %src) {
+; GFX12-LABEL: v_s_amdgcn_log_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_log_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call float @llvm.amdgcn.log.f32(float %src)
+ ret float %result
+}
+
+define amdgpu_cs half @v_s_amdgcn_log_f16(half inreg %src) {
+; GFX12-LABEL: v_s_amdgcn_log_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_log_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call half @llvm.amdgcn.log.f16(half %src)
+ ret half %result
+}
+
+define amdgpu_cs float @v_s_rcp_f32(float inreg %src) {
+; GFX12-LABEL: v_s_rcp_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_rcp_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call fast float @llvm.amdgcn.rcp.f32(float %src)
+ ret float %result
+}
+
+define amdgpu_cs half @v_s_rcp_f16(half inreg %src) {
+; GFX12-LABEL: v_s_rcp_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_rcp_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call fast half @llvm.amdgcn.rcp.f16(half %src)
+ ret half %result
+}
+
+; TODO-GFX12: GlobalISel should generate v_s_rsq.
+define amdgpu_cs float @v_s_rsq_f32(float inreg %src) {
+; GFX12-SDAG-LABEL: v_s_rsq_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_s_rsq_f32 s0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: v_s_rsq_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_s_sqrt_f32 s0, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: v_s_rcp_f32 s0, s0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+ %sqrt = call fast float @llvm.sqrt.f32(float %src)
+ %fdiv = fdiv fast float 1.0, %sqrt
+ ret float %fdiv
+}
+
+define amdgpu_cs half @v_s_rsq_f16(half inreg %src) {
+; GFX12-LABEL: v_s_rsq_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_rsq_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %sqrt = call fast half @llvm.sqrt.f16(half %src)
+ %result = fdiv fast half 1.0, %sqrt
+ ret half %result
+}
+
+; TODO-GFX12: Should not use any VALU instructions.
+define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
+; GFX12-SDAG-LABEL: v_s_sqrt_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_mul_f32 s1, s0, 0x4f800000
+; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, s1, s0
+; GFX12-SDAG-NEXT: v_s_sqrt_f32 s2, s1
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_co_i32 s3, s2, -1
+; GFX12-SDAG-NEXT: s_xor_b32 s5, s3, 0x80000000
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_fmac_f32 s4, s5, s2
+; GFX12-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX12-SDAG-NEXT: s_cmp_le_f32 s4, 0
+; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s2
+; GFX12-SDAG-NEXT: s_add_co_i32 s4, s2, 1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_xor_b32 s6, s4, 0x80000000
+; GFX12-SDAG-NEXT: s_fmac_f32 s5, s6, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_cmp_gt_f32 s5, 0
+; GFX12-SDAG-NEXT: s_cselect_b32 s2, s4, s3
+; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000
+; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000
+; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260
+; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo
+; GFX12-SDAG-NEXT: s_cselect_b32 s0, s1, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: v_s_sqrt_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_cmp_gt_f32 0xf800000, s0
+; GFX12-GISEL-NEXT: s_mul_f32 s2, s0, 0x4f800000
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0
+; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0
+; GFX12-GISEL-NEXT: s_mov_b32 s4, s0
+; GFX12-GISEL-NEXT: s_mov_b32 s6, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_add_co_i32 s3, s2, -1
+; GFX12-GISEL-NEXT: s_xor_b32 s5, s3, 0x80000000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_fmac_f32 s4, s5, s2
+; GFX12-GISEL-NEXT: s_add_co_i32 s5, s2, 1
+; GFX12-GISEL-NEXT: s_xor_b32 s7, s5, 0x80000000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT: s_cmp_le_f32 s4, 0
+; GFX12-GISEL-NEXT: s_fmac_f32 s6, s7, s2
+; GFX12-GISEL-NEXT: s_cselect_b32 s2, s3, s2
+; GFX12-GISEL-NEXT: s_cmp_gt_f32 s6, 0
+; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+ %result = call float @llvm.sqrt.f32(float %src)
+ ret float %result
+}
+
+define amdgpu_cs half @v_s_sqrt_f16(half inreg %src) {
+; GFX12-LABEL: v_s_sqrt_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_sqrt_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call half @llvm.sqrt.f16(half %src)
+ ret half %result
+}
+
+define amdgpu_cs float @v_amdgcn_sqrt_f32(float inreg %src) {
+; GFX12-LABEL: v_amdgcn_sqrt_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_sqrt_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call float @llvm.amdgcn.sqrt.f32(float %src)
+ ret float %result
+}
+
+define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src) {
+; GFX12-LABEL: v_amdgcn_sqrt_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_sqrt_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %result = call half @llvm.amdgcn.sqrt.f16(half %src)
+ ret half %result
+}
+
+define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
+; GFX12-LABEL: srcmods_abs_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_bitset0_b32 s0, 31
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000
+; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT: v_s_log_f32 s0, s0
+; GFX12-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %abs = call float @llvm.fabs.f32(float %src)
+ %result = call float @llvm.log2.f32(float %abs)
+ ret float %result
+}
+
+define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
+; GFX12-SDAG-LABEL: srcmods_neg_f32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_xor_b32 s1, s0, 0x80000000
+; GFX12-SDAG-NEXT: s_cmp_gt_f32 s0, 0x80800000
+; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_mul_f32 s0, s1, s0
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: srcmods_neg_f32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
+; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+ %neg = fneg float %src
+ %result = call float @llvm.log2.f32(float %neg)
+ ret float %result
+}
+
+define amdgpu_cs half @srcmods_abs_f16(half inreg %src) {
+; GFX12-LABEL: srcmods_abs_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_log_f16 s0, |s0|
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %abs = call half @llvm.fabs.f16(half %src)
+ %result = call half @llvm.log2.f16(half %abs)
+ ret half %result
+}
+
+define amdgpu_cs half @srcmods_neg_f16(half inreg %src) {
+; GFX12-LABEL: srcmods_neg_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_s_log_f16 s0, -s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %neg = fneg half %src
+ %result = call half @llvm.log2.f16(half %neg)
+ ret half %result
+}
+
+declare half @llvm.exp2.f16(half)
+declare float @llvm.exp2.f32(float)
+declare half @llvm.amdgcn.exp2.f16(half)
+declare float @llvm.amdgcn.exp2.f32(float)
+declare half @llvm.log2.f16(half)
+declare float @llvm.log2.f32(float)
+declare half @llvm.amdgcn.log.f16(half)
+declare float @llvm.amdgcn.log.f32(float)
+declare half @llvm.amdgcn.rcp.f16(half)
+declare float @llvm.amdgcn.rcp.f32(float)
+declare half @llvm.sqrt.f16(half)
+declare float @llvm.sqrt.f32(float)
+declare half @llvm.amdgcn.sqrt.f16(half)
+declare float @llvm.amdgcn.sqrt.f32(float)
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
>From 29322598788edbcb38f282c23c7ca7de47f6c4c3 Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin <Mirko.Brkusanin at amd.com>
Date: Tue, 12 Dec 2023 16:42:27 +0100
Subject: [PATCH 4/4] [AMDGPU] MoveToVALU for pseudo scalar transcendental
instructions
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12 +-
.../move-to-valu-pseudo-scalar-trans.ll | 218 ++++++++++++++++++
2 files changed, 229 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 03ffe8e10f4bbd..c2b8da1fc3921f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5305,6 +5305,16 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
+ case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
+ case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64;
+ case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
+ case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64;
+ case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
+ case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64;
+ case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
+ case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64;
+ case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
+ case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64;
}
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
@@ -7189,7 +7199,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Use the new VALU Opcode.
auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
.setMIFlags(Inst.getFlags());
- if (isVOP3(NewOpcode)) {
+ if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
// Intersperse VOP3 modifiers among the SALU operands.
NewInstr->addOperand(Inst.getOperand(0));
if (AMDGPU::getNamedOperandIdx(NewOpcode,
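
Note the new !isVOP3(Opcode) guard: the v_s_* pseudos are already VOP3-encoded, with src modifiers, clamp and omod operands in place, so when getVALUOp swaps in the VALU counterpart the operands can be transferred as-is. Interspersing a fresh set of modifier operands, the path guarded here, is only needed when lowering from a genuinely scalar (SALU) opcode. The test below exercises the rewrite with divergent, loaded operands.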
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
new file mode 100644
index 00000000000000..67baec82b8396a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
@@ -0,0 +1,218 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: exp_f32
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F32_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile float, ptr addrspace(1) %ptr
+ %res = call float @llvm.amdgcn.exp2.f32(float %val)
+ store float %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: exp_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_t16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.exp2.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: log_f32
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_LOG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F32_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile float, ptr addrspace(1) %ptr
+ %res = call float @llvm.amdgcn.log.f32(float %val)
+ store float %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: log_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_t16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.log.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: rcp_f32
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F32_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile float, ptr addrspace(1) %ptr
+ %res = call float @llvm.amdgcn.rcp.f32(float %val)
+ store float %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: rcp_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_t16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.rcp.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: rsq_f32
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F32_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile float, ptr addrspace(1) %ptr
+ %res = call float @llvm.amdgcn.rsq.f32(float %val)
+ store float %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: rsq_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_t16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.rsq.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: sqrt_f32
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_SQRT_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F32_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile float, ptr addrspace(1) %ptr
+ %res = call float @llvm.amdgcn.sqrt.f32(float %val)
+ store float %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: sqrt_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_t16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.sqrt.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+declare float @llvm.amdgcn.exp2.f32(float)
+declare half @llvm.amdgcn.exp2.f16(half)
+declare float @llvm.amdgcn.log.f32(float)
+declare half @llvm.amdgcn.log.f16(half)
+declare float @llvm.amdgcn.rcp.f32(float)
+declare half @llvm.amdgcn.rcp.f16(half)
+declare float @llvm.amdgcn.rsq.f32(float)
+declare half @llvm.amdgcn.rsq.f16(half)
+declare float @llvm.amdgcn.sqrt.f32(float)
+declare half @llvm.amdgcn.sqrt.f16(half)