[llvm] AMDGPU: Fix broken exp10 lowering for f16 (PR #170582)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 3 16:30:35 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
The f16 exp10 lowering was calling the exp handling, so it was
multiplying by the wrong constant.
GlobalISel is still broken; it is missing the fast exp10 path.
This is tracked in https://github.com/llvm/llvm-project/issues/170576
---
Patch is 38.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170582.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+7-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+385-126)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 971dfdbe3e70a..5be5c66ba17dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3055,8 +3055,11 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
if (VT.getScalarType() == MVT::f16) {
// v_exp_f16 (fmul x, log2e)
- if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
- return lowerFEXPUnsafe(X, SL, DAG, Flags);
+
+ if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
+ return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
+ : lowerFEXPUnsafe(X, SL, DAG, Flags);
+ }
if (VT.isVector())
return SDValue();
@@ -3066,7 +3069,8 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
// Nothing in half is a denormal when promoted to f32.
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
- SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
+ SDValue Lowered = IsExp10 ? lowerFEXP10Unsafe(Ext, SL, DAG, Flags)
+ : lowerFEXPUnsafe(Ext, SL, DAG, Flags);
return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
DAG.getTargetConstant(0, SL, MVT::i32), Flags);
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 3928ec2dd76d3..8a0e02664fc6c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -5877,22 +5877,37 @@ define float @v_exp10_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
; FIXME: Fold out fp16_to_fp (FP_TO_FP16) on no-f16 targets
define half @v_exp10_f16(half %in) {
-; GCN-LABEL: v_exp10_f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-SDAG-LABEL: v_exp10_f16:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; GCN-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-GISEL-LABEL: v_exp10_f16:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -5920,22 +5935,37 @@ define half @v_exp10_f16(half %in) {
}
define half @v_exp10_fabs_f16(half %in) {
-; GCN-LABEL: v_exp10_fabs_f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-SDAG-LABEL: v_exp10_fabs_f16:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; GCN-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-GISEL-LABEL: v_exp10_fabs_f16:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_fabs_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -5967,9 +5997,12 @@ define half @v_exp10_fneg_fabs_f16(half %in) {
; GCN-SDAG-LABEL: v_exp10_fneg_fabs_f16:
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0
+; GCN-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
+; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; GCN-SDAG-NEXT: v_exp_f32_e32 v1, v1
; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -5987,8 +6020,11 @@ define half @v_exp10_fneg_fabs_f16(half %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0xba2784bc, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549000, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -6021,9 +6057,12 @@ define half @v_exp10_fneg_f16(half %in) {
; GCN-SDAG-LABEL: v_exp10_fneg_f16:
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0
+; GCN-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; GCN-SDAG-NEXT: v_exp_f32_e32 v1, v1
; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -6041,8 +6080,11 @@ define half @v_exp10_fneg_f16(half %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -6071,20 +6113,33 @@ define half @v_exp10_fneg_f16(half %in) {
}
define half @v_exp10_f16_fast(half %in) {
-; GCN-LABEL: v_exp10_f16_fast:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0
-; GCN-NEXT: v_exp_f16_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-SDAG-LABEL: v_exp10_f16_fast:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-SDAG-NEXT: v_mul_f16_e32 v1, 0x113c, v0
+; GCN-SDAG-NEXT: v_mul_f16_e32 v0, 0x42a4, v0
+; GCN-SDAG-NEXT: v_exp_f16_e32 v1, v1
+; GCN-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GCN-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-GISEL-LABEL: v_exp10_f16_fast:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-GISEL-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0
+; GCN-GISEL-NEXT: v_exp_f16_e32 v0, v0
+; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_f16_fast:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a278000, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp10_f16_fast:
@@ -6117,11 +6172,17 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -6140,19 +6201,39 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_exp10_v2f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX900-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_exp10_v2f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v3
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_exp10_v2f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_v2f16:
; SI-SDAG: ; %bb.0:
@@ -6161,11 +6242,17 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3
; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
@@ -6203,11 +6290,17 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -6232,11 +6325,17 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -6263,11 +6362,17 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3
; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
@@ -6311,11 +6416,17 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -6340,11 +6451,17 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -6375,11 +6492,17 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0
; SI-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3a2784bc, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2
; SI-SDAG-NEXT: v_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/170582
More information about the llvm-commits
mailing list