[llvm-branch-commits] [llvm] Reapply "AMDGPU: Use real copysign in fast pow (#97152)" (PR #178036)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 26 11:46:50 PST 2026
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes:
This reverts commit bff619f91015a633df659d7f60f842d5c49351df.
The original change was reverted due to regressions caused by poor copysign
optimization; those issues have since been fixed.
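
For readers skimming the patch, here is a minimal before/after sketch of the IR-level change being reapplied, transcribed from the f32 pown test update below (test_pown_afn_nnan_ninf_f32); the value names follow that test's CHECK lines. The hand-built sign-bit OR/bitcast sequence is replaced with a direct llvm.copysign call:

```llvm
; Before: the sign bit extracted from %x is OR'd back into the exp2
; result through integer bitcasts.
  %__yeven = shl i32 %y, 31
  %0 = bitcast float %x to i32
  %__pow_sign = and i32 %__yeven, %0
  %1 = bitcast float %__exp2 to i32
  %2 = or disjoint i32 %__pow_sign, %1
  %3 = bitcast i32 %2 to float
  ret float %3

; After (this patch): the isolated sign bit is bitcast back to float and
; applied with the copysign intrinsic instead.
  %__yeven = shl i32 %y, 31
  %0 = bitcast float %x to i32
  %__pow_sign = and i32 %__yeven, %0
  %1 = bitcast i32 %__pow_sign to float
  %__pow_sign1 = call nnan ninf afn float @llvm.copysign.f32(float %__exp2, float %1)
  ret float %__pow_sign1
```

Emitting the intrinsic exposes the sign transfer to copysign-aware optimizations rather than leaving it as opaque integer bit math, which is presumably why the earlier copysign codegen regressions blocked this change.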
---
Patch is 20.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/178036.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll (+21-28)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll (+24-32)
- (modified) llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll (+8-9)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 551e1a82e5d61..4de9349fe5166 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1056,17 +1056,18 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
if (needcopysign) {
Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
Type *nTy = FPOp->getType()->getWithNewType(nTyS);
- unsigned size = nTy->getScalarSizeInBits();
Value *opr_n = FPOp->getOperand(1);
if (opr_n->getType()->getScalarType()->isIntegerTy())
opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
else
opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
+ unsigned size = nTy->getScalarSizeInBits();
Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
- nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
- nval = B.CreateBitCast(nval, opr0->getType());
+
+ nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
+ nullptr, "__pow_sign");
}
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 1ab1aa09a3432..04daf2753384e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -68,7 +68,7 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
; CHECK-NEXT: v_exp_f16_e32 v2, v2
-; CHECK-NEXT: v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT: v_or_b32_e32 v0, v2, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to half
%pow = tail call fast half @_Z3powDhDh(half %x, half %y)
@@ -177,7 +177,7 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
+; CHECK-NEXT: v_or_b32_e32 v1, v1, v2
; CHECK-NEXT: v_readlane_b32 s53, v43, 13
; CHECK-NEXT: v_readlane_b32 s52, v43, 12
; CHECK-NEXT: v_readlane_b32 s51, v43, 11
@@ -354,7 +354,7 @@ define half @test_pown_fast_f16(half %x, i32 %y) {
; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
; CHECK-NEXT: v_exp_f16_e32 v2, v2
-; CHECK-NEXT: v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT: v_or_b32_e32 v0, v2, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
ret half %call
@@ -459,7 +459,7 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
+; CHECK-NEXT: v_or_b32_e32 v1, v1, v2
; CHECK-NEXT: v_readlane_b32 s53, v43, 13
; CHECK-NEXT: v_readlane_b32 s52, v43, 12
; CHECK-NEXT: v_readlane_b32 s51, v43, 11
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
index 6eb67381ef8ae..e1e0c1828a430 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -2213,10 +2213,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y.cast = sitofp i32 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2301,10 +2300,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y.cast = uitofp i32 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2350,10 +2348,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y.cast = uitofp i256 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2373,10 +2370,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256(float %x, i2
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y.cast = sitofp i256 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2396,10 +2392,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp(<2 x floa
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 31)
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
-; CHECK-NEXT: ret <2 x float> [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT: ret <2 x float> [[__POW_SIGN1]]
;
%y.cast = sitofp <2 x i32> %y to <2 x float>
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2445,10 +2440,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp(<2 x floa
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 31)
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
-; CHECK-NEXT: ret <2 x float> [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT: ret <2 x float> [[__POW_SIGN1]]
;
%y.cast = uitofp <2 x i32> %y to <2 x float>
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2557,10 +2551,9 @@ define float @test_pow_afn_f32_nnan_ninf__y_known_integral_trunc(float %x, float
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y = call float @llvm.trunc.f32(float %y.arg)
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
index e081b460dae7f..2f0db29fc763b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
@@ -677,10 +677,9 @@ define float @test_pown_afn_nnan_ninf_f32(float %x, i32 %y) {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT: ret float [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn float @_Z4pownfi(float %x, i32 %y)
@@ -699,10 +698,9 @@ define <2 x float> @test_pown_afn_nnan_ninf_v2f32(<2 x float> %x, <2 x i32> %y)
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[Y]], splat (i32 31)
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float>
-; CHECK-NEXT: ret <2 x float> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP1]])
+; CHECK-NEXT: ret <2 x float> [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn <2 x float> @_Z4pownDv2_fDv2_i(<2 x float> %x, <2 x i32> %y)
@@ -722,10 +720,9 @@ define double @test_pown_afn_nnan_ninf_f64(double %x, i32 %y) {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i64 [[__YTOU]], 63
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double [[X]] to i64
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i64 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[__EXP2]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
-; CHECK-NEXT: ret double [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[__POW_SIGN]] to double
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn double @llvm.copysign.f64(double [[__EXP2]], double [[TMP1]])
+; CHECK-NEXT: ret double [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn double @_Z4powndi(double %x, i32 %y)
@@ -745,10 +742,9 @@ define <2 x double> @test_pown_afn_nnan_ninf_v2f64(<2 x double> %x, <2 x i32> %y
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i64> [[__YTOU]], splat (i64 63)
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[X]] to <2 x i64>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i64> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[__EXP2]] to <2 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint <2 x i64> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <2 x double>
-; CHECK-NEXT: ret <2 x double> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[__POW_SIGN]] to <2 x double>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x double> @llvm.copysign.v2f64(<2 x double> [[__EXP2]], <2 x double> [[TMP1]])
+; CHECK-NEXT: ret <2 x double> [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn <2 x double> @_Z4pownDv2_dDv2_i(<2 x double> %x, <2 x i32> %y)
@@ -768,10 +764,9 @@ define half @test_pown_afn_nnan_ninf_f16(half %x, i32 %y) {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i16 [[__YTOU]], 15
; CHECK-NEXT: [[TMP0:%.*]] = bitcast half [[X]] to i16
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i16 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[__EXP2]] to i16
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i16 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP2]] to half
-; CHECK-NEXT: ret half [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[__POW_SIGN]] to half
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn half @llvm.copysign.f16(half [[__EXP2]], half [[TMP1]])
+; CHECK-NEXT: ret half [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn half @_Z4pownDhi(half %x, i32 %y)
@@ -791,10 +786,9 @@ define <2 x half> @test_pown_afn_nnan_ninf_v2f16(<2 x half> %x, <2 x i32> %y) {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i16> [[__YTOU]], splat (i16 15)
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x half> [[X]] to <2 x i16>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i16> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x half> [[__EXP2]] to <2 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint <2 x i16> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x half>
-; CHECK-NEXT: ret <2 x half> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[__POW_SIGN]] to <2 x half>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x half> @llvm.copysign.v2f16(<2 x half> [[__EXP2]], <2 x half> [[TMP1]])
+; CHECK-NEXT: ret <2 x half> [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn <2 x half> @_Z4pownDv2_DhDv2_i(<2 x half> %x, <2 x i32> %y)
@@ -825,10 +819,9 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT: ret float [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call fast float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]]) #[[ATTR0]]
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
entry:
%call = tail call fast float @_Z4pownfi(float %x, i32 %y) #1
@@ -1071,10 +1064,9 @@ define float @test_pown_afn_ninf_nnan_f32__x_known_positive(float nofpclass(ninf
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT: ret float [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
entry:
%call = tail call afn ninf nnan float @_Z4pownfi(float %x, i32 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 3855fc0c9c45a..36a7ee057d24b 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -359,9 +359,8 @@ declare half @_Z4pownDhi(half, i32)
; GCN-NATIVE: %__yeven = shl i16 %__ytou, 15
; GCN-NATIVE: %0 = bitcast half %x to i16
; GCN-NATIVE: %__pow_sign = and i16 %__yeven, %0
-; GCN-NATIVE: %1 = bitcast half %__exp2 to i16
-; GCN-NATIVE: %2 = or disjoint i16 %__pow_sign, %1
-; GCN-NATIVE: %3 = bitcast i16 %2 to half
+; GCN-NATIVE: %1 = bitcast i16 %__pow_sign to half
+; GCN-NATIVE: %__pow_sign1 = tail call fast half @llvm.copysign.f16(half %__exp2, half %1)
define half @test_pown_f16(half %x, i32 %y) {
entry:
%call = call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -375,7 +374,7 @@ declare float @_Z4pownfi(float, i32)
; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs)
; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03
; GCN: %__exp2 = tail call fast nofpclass(nan ninf nzero nsub nnorm) float @llvm.exp2.f32(float %__ylogx)
-; GCN: %[[r0:.*]] = tail call float @llvm.copysign.f32(float %__exp2, float %tmp)
+; GCN: %[[r0:.*]] = tail call fast float @llvm.copysign.f32(float %__exp2, float %tmp)
; GCN: store float %[[r0]], ptr addrspace(1) %a, align 4
define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
entry:
@@ -410,9 +409,9 @@ entry:
; GCN: %__yeven = shl i32 %conv, 31
; GCN: %[[r0:.*]] = bitcast float %tmp to i32
; GCN: %__pow_sign = and i32 %__yeven, %[[r0]]
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r1:.*]] = bitcast i32 %__pow_sign to float
+; GCN: %[[r2:.*]] = tail call fast float @llvm.copysign.f32(float %__exp2, float %[[r1]])
+; GCN: store float %[[r2]], ptr addrspace(1) %a, align 4
define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4
@@ -432,7 +431,7 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs)
; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80
; GCN: %__exp2 = tail call fast nofpclass(nan ninf nzero nsub nnorm) half @llvm.exp2.f16(half %__ylogx)
-; GCN: %1 = tail call half @llvm.copysign.f16(half %__exp2, half %x)
+; GCN: %__pow_sign1 = tail call fast half @llvm.copysign.f16(half %__exp2, half %x)
define half @test_pow_fast_f16__y_13(half %x) {
%powr = tail call fast half @_Z3powDhDh(half %x, half 13.0)
ret half %powr
@@ -443,7 +442,7 @@ define half @test_pow_fast_f16__y_13(half %x) {
; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs)
; GCN: %__ylogx = fmul fast <2 x half> %__log2, splat (half 0xH4A80)
; GCN: %__exp2 = tail call...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/178036
More information about the llvm-branch-commits mailing list