[llvm] AMDGPU/GlobalISel: Fix broken exp10 lowering for f16 (PR #170708)

via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 4 09:57:39 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

<details>
<summary>Changes</summary>



---

Patch is 43.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170708.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+65-2) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+334-198) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ae62dbe1cc706..8c157f30da268 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3756,6 +3756,67 @@ bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
   return true;
 }
 
+static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
+                                    const SrcOp &Src, unsigned Flags) {
+  LLT Ty = Dst.getLLTTy(*B.getMRI());
+
+  if (Ty == LLT::scalar(32)) {
+    return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
+        .addUse(Src.getReg())
+        .setMIFlags(Flags);
+  }
+  return B.buildFExp2(Dst, Src, Flags);
+}
+
+bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
+                                               Register Dst, Register X,
+                                               unsigned Flags) const {
+  LLT Ty = B.getMRI()->getType(Dst);
+  LLT F32 = LLT::scalar(32);
+
+  if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
+    // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
+    auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
+    auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
+
+    auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
+    auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
+    auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
+    auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
+    B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
+    return true;
+  }
+
+  // bool s = x < -0x1.2f7030p+5f;
+  // x += s ? 0x1.0p+5f : 0.0f;
+  // exp10 = exp2(x * 0x1.a92000p+1f) *
+  //        exp2(x * 0x1.4f0978p-11f) *
+  //        (s ? 0x1.9f623ep-107f : 1.0f);
+
+  auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
+  auto NeedsScaling =
+      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
+
+  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
+  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
+  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
+
+  auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
+  auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
+
+  auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
+  auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
+  auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
+  auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
+
+  auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
+  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
+  auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
+
+  B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
   Register Dst = MI.getOperand(0).getReg();
@@ -3772,7 +3833,8 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
     // v_exp_f16 (fmul x, log2e)
     if (allowApproxFunc(MF, Flags)) {
       // TODO: Does this really require fast?
-      legalizeFExpUnsafe(B, Dst, X, Flags);
+      IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
+              : legalizeFExpUnsafe(B, Dst, X, Flags);
       MI.eraseFromParent();
       return true;
     }
@@ -3783,7 +3845,8 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
     // Nothing in half is a denormal when promoted to f32.
     auto Ext = B.buildFPExt(F32, X, Flags);
     Register Lowered = MRI.createGenericVirtualRegister(F32);
-    legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
+    IsExp10 ? legalizeFExp10Unsafe(B, Lowered, Ext.getReg(0), Flags)
+            : legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
     B.buildFPTrunc(Dst, Lowered, Flags);
     MI.eraseFromParent();
     return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 31db548d2af88..c32b6648e6bb7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -93,6 +93,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const;
   bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src,
                           unsigned Flags) const;
+  bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src,
+                            unsigned Flags) const;
   bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
   bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
   bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 8a0e02664fc6c..daf0cf8a8aacd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -5877,26 +5877,17 @@ define float @v_exp10_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 
 ; FIXME: Fold out fp16_to_fp (FP_TO_FP16) on no-f16 targets
 define half @v_exp10_f16(half %in) {
-; GCN-SDAG-LABEL: v_exp10_f16:
-; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
-; GCN-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-GISEL-LABEL: v_exp10_f16:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp10_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; GCN-NEXT:    v_exp_f32_e32 v1, v1
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_f16:
 ; SI-SDAG:       ; %bb.0:
@@ -5916,8 +5907,11 @@ define half @v_exp10_f16(half %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5935,26 +5929,17 @@ define half @v_exp10_f16(half %in) {
 }
 
 define half @v_exp10_fabs_f16(half %in) {
-; GCN-SDAG-LABEL: v_exp10_fabs_f16:
-; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
-; GCN-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-GISEL-LABEL: v_exp10_fabs_f16:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp10_fabs_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; GCN-NEXT:    v_exp_f32_e32 v1, v1
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
@@ -5974,8 +5959,11 @@ define half @v_exp10_fabs_f16(half %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5994,26 +5982,17 @@ define half @v_exp10_fabs_f16(half %in) {
 }
 
 define half @v_exp10_fneg_fabs_f16(half %in) {
-; GCN-SDAG-LABEL: v_exp10_fneg_fabs_f16:
-; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
-; GCN-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-GISEL-LABEL: v_exp10_fneg_fabs_f16:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp10_fneg_fabs_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; GCN-NEXT:    v_exp_f32_e32 v1, v1
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_fneg_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
@@ -6033,8 +6012,11 @@ define half @v_exp10_fneg_fabs_f16(half %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6054,26 +6036,17 @@ define half @v_exp10_fneg_fabs_f16(half %in) {
 }
 
 define half @v_exp10_fneg_f16(half %in) {
-; GCN-SDAG-LABEL: v_exp10_fneg_f16:
-; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
-; GCN-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-GISEL-LABEL: v_exp10_fneg_f16:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp10_fneg_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; GCN-NEXT:    v_exp_f32_e32 v1, v1
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_fneg_f16:
 ; SI-SDAG:       ; %bb.0:
@@ -6093,8 +6066,11 @@ define half @v_exp10_fneg_f16(half %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6113,22 +6089,15 @@ define half @v_exp10_fneg_f16(half %in) {
 }
 
 define half @v_exp10_f16_fast(half %in) {
-; GCN-SDAG-LABEL: v_exp10_f16_fast:
-; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT:    v_mul_f16_e32 v1, 0x113c, v0
-; GCN-SDAG-NEXT:    v_mul_f16_e32 v0, 0x42a4, v0
-; GCN-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GCN-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GCN-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-GISEL-LABEL: v_exp10_f16_fast:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mul_f16_e32 v0, 0x3dc5, v0
-; GCN-GISEL-NEXT:    v_exp_f16_e32 v0, v0
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_exp10_f16_fast:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f16_e32 v1, 0x113c, v0
+; GCN-NEXT:    v_mul_f16_e32 v0, 0x42a4, v0
+; GCN-NEXT:    v_exp_f16_e32 v1, v1
+; GCN-NEXT:    v_exp_f16_e32 v0, v0
+; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_f16_fast:
 ; SI-SDAG:       ; %bb.0:
@@ -6146,10 +6115,19 @@ define half @v_exp10_f16_fast(half %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3a278000, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40548000, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6192,48 +6170,40 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3a2784bc, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x40549000, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3a2784bc, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; VI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v3, v3
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_exp10_v2f16:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3a2784bc, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x40549000, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v3, 0x3a2784bc, v0
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v3, v3
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v2
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v3
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_exp10_v2f16:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_exp10_v2f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX900-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-NEXT:    v_mul_f32_e32 v2, 0x3a2784bc, v1
+; GFX900-NEXT:    v_mul_f32_e32 v1, 0x40549000, v1
+; GFX900-NEXT:    v_mul_f32_e32 v3, 0x3a2784bc, v0
+; GFX900-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; GFX900-NEXT:    v_exp_f32_e32 v2, v2
+; GFX900-NEXT:    v_exp_f32_e32 v1, v1
+; GFX900-NEXT:    v_exp_f32_e32 v3, v3
+; GFX900-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX900-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_v2f16:
 ; SI-SDAG:       ; %bb.0:
@@ -6263,11 +6233,17 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3a2784bc, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549000, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3a2784bc, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x40549000, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v3, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6311,11 +6287,17 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/170708


More information about the llvm-commits mailing list