[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Sat Jun 29 00:48:56 PDT 2024


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/97152

Using a real copysign intrinsic here previously introduced some codegen
regressions, but those have been avoided by simplifying demanded bits on
copysign operations.
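
For reference, a minimal sketch of the f32 IR this fold now emits (function
and value names are illustrative; the pattern follows the CHECK lines in the
tests below). The old bitcast/or/bitcast splice of the sign bit is replaced
by a bitcast of the computed sign word plus a call to llvm.copysign:

  ; sketch only: %__exp2 stands for the already-computed exp2(y * log2(|x|))
  define float @pow_sign_sketch(float %x, float %__exp2, i32 %y) {
    %__yeven = shl i32 %y, 31                     ; sign bit is set iff y is odd
    %xbits = bitcast float %x to i32
    %__pow_sign = and i32 %__yeven, %xbits        ; take x's sign only for odd y
    %signf = bitcast i32 %__pow_sign to float
    %r = call nnan ninf afn float @llvm.copysign.f32(float %__exp2, float %signf)
    ret float %r
  }

  declare float @llvm.copysign.f32(float, float)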

From 339a76086df9e50218561f568d70683f14ef1631 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 26 Aug 2023 13:19:06 -0400
Subject: [PATCH] AMDGPU: Use real copysign in fast pow

Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp     |  7 ++-
 .../AMDGPU/amdgpu-simplify-libcall-pow.ll     | 52 ++++++++--------
 .../AMDGPU/amdgpu-simplify-libcall-pown.ll    | 59 ++++++++-----------
 llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 30 +++++-----
 4 files changed, 68 insertions(+), 80 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 456f3cb332cf8..27fa67ce5b45a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1131,17 +1131,18 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
   if (needcopysign) {
     Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
     Type *nTy = FPOp->getType()->getWithNewType(nTyS);
-    unsigned size = nTy->getScalarSizeInBits();
     Value *opr_n = FPOp->getOperand(1);
     if (opr_n->getType()->getScalarType()->isIntegerTy())
       opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
     else
       opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
 
+    unsigned size = nTy->getScalarSizeInBits();
     Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
     sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
-    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
-    nval = B.CreateBitCast(nval, opr0->getType());
+
+    nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
+                            nullptr, "__pow_sign");
   }
 
   LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
index 6b4b0f881f3be..ab52c8ff8d399 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -1783,7 +1783,8 @@ define float @test_pow_afn_f32_nnan_ninf__y_10(float %x) {
 define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison(<2 x float> %x) {
 ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison
 ; CHECK-SAME: (<2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    ret <2 x float> poison
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> poison)
+; CHECK-NEXT:    ret <2 x float> [[__EXP2]]
 ;
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> poison)
   ret <2 x float> %pow
@@ -2215,10 +2216,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y.cast = sitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2303,10 +2303,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y.cast = uitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2352,10 +2351,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y.cast = uitofp i256 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2375,10 +2373,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256(float %x, i2
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y.cast = sitofp i256 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2398,10 +2395,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp(<2 x floa
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[TMP1]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
-; CHECK-NEXT:    ret <2 x float> [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT:    ret <2 x float> [[__POW_SIGN1]]
 ;
   %y.cast = sitofp <2 x i32> %y to <2 x float>
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2447,10 +2443,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp(<2 x floa
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[TMP1]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
-; CHECK-NEXT:    ret <2 x float> [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT:    ret <2 x float> [[__POW_SIGN1]]
 ;
   %y.cast = uitofp <2 x i32> %y to <2 x float>
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2559,10 +2554,9 @@ define float @test_pow_afn_f32_nnan_ninf__y_known_integral_trunc(float %x, float
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y = call float @llvm.trunc.f32(float %y.arg)
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
index 77db224af2890..8d5705d4deb4c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
@@ -679,10 +679,9 @@ define float @test_pown_afn_nnan_ninf_f32(float %x, i32 %y) {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT:    ret float [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn float @_Z4pownfi(float %x, i32 %y)
@@ -701,10 +700,9 @@ define <2 x float> @test_pown_afn_nnan_ninf_v2f32(<2 x float> %x, <2 x i32> %y)
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[Y]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    ret <2 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP1]])
+; CHECK-NEXT:    ret <2 x float> [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn <2 x float> @_Z4pownDv2_fDv2_i(<2 x float> %x, <2 x i32> %y)
@@ -724,10 +722,9 @@ define double @test_pown_afn_nnan_ninf_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i64 [[__YTOU]], 63
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double [[X]] to i64
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i64 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[__EXP2]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
-; CHECK-NEXT:    ret double [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[__POW_SIGN]] to double
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn double @llvm.copysign.f64(double [[__EXP2]], double [[TMP1]])
+; CHECK-NEXT:    ret double [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn double @_Z4powndi(double %x, i32 %y)
@@ -747,10 +744,9 @@ define <2 x double> @test_pown_afn_nnan_ninf_v2f64(<2 x double> %x, <2 x i32> %y
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i64> [[__YTOU]], <i64 63, i64 63>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[X]] to <2 x i64>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i64> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[__EXP2]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[__POW_SIGN]] to <2 x double>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x double> @llvm.copysign.v2f64(<2 x double> [[__EXP2]], <2 x double> [[TMP1]])
+; CHECK-NEXT:    ret <2 x double> [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn <2 x double> @_Z4pownDv2_dDv2_i(<2 x double> %x, <2 x i32> %y)
@@ -770,10 +766,9 @@ define half @test_pown_afn_nnan_ninf_f16(half %x, i32 %y) {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i16 [[__YTOU]], 15
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast half [[X]] to i16
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i16 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[__EXP2]] to i16
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i16 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[TMP2]] to half
-; CHECK-NEXT:    ret half [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[__POW_SIGN]] to half
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn half @llvm.copysign.f16(half [[__EXP2]], half [[TMP1]])
+; CHECK-NEXT:    ret half [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn half @_Z4pownDhi(half %x, i32 %y)
@@ -793,10 +788,9 @@ define <2 x half> @test_pown_afn_nnan_ninf_v2f16(<2 x half> %x, <2 x i32> %y) {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i16> [[__YTOU]], <i16 15, i16 15>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x half> [[X]] to <2 x i16>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i16> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x half> [[__EXP2]] to <2 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint <2 x i16> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x half>
-; CHECK-NEXT:    ret <2 x half> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[__POW_SIGN]] to <2 x half>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x half> @llvm.copysign.v2f16(<2 x half> [[__EXP2]], <2 x half> [[TMP1]])
+; CHECK-NEXT:    ret <2 x half> [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn <2 x half> @_Z4pownDv2_DhDv2_i(<2 x half> %x, <2 x i32> %y)
@@ -827,10 +821,9 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT:    ret float [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call fast float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]]) #[[ATTR0]]
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y) #1
@@ -840,7 +833,8 @@ entry:
 define float @test_pown_fast_f32__y_poison(float %x) {
 ; CHECK-LABEL: define float @test_pown_fast_f32__y_poison
 ; CHECK-SAME: (float [[X:%.*]]) {
-; CHECK-NEXT:    ret float poison
+; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float poison)
+; CHECK-NEXT:    ret float [[__EXP2]]
 ;
   %call = tail call fast float @_Z4pownfi(float %x, i32 poison)
   ret float %call
@@ -1073,10 +1067,9 @@ define float @test_pown_afn_ninf_nnan_f32__x_known_positive(float nofpclass(ninf
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT:    ret float [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call afn ninf nnan float @_Z4pownfi(float %x, i32 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 5a241f85b2e2c..7117f5ccc76e6 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -359,9 +359,9 @@ declare half @_Z4pownDhi(half, i32)
 ; GCN-NATIVE: %__yeven = shl i16 %__ytou, 15
 ; GCN-NATIVE: %0 = bitcast half %x to i16
 ; GCN-NATIVE: %__pow_sign = and i16 %__yeven, %0
-; GCN-NATIVE: %1 = bitcast half %__exp2 to i16
-; GCN-NATIVE: %2 = or disjoint i16 %__pow_sign, %1
-; GCN-NATIVE: %3 = bitcast i16 %2 to half
+; GCN-NATIVE: %1 = bitcast i16 %__pow_sign to half
+; GCN-NATIVE: %__pow_sign1 = tail call fast half @llvm.copysign.f16(half %__exp2, half %1)
+; GCN-NATIVE: ret half %__pow_sign1
 define half @test_pown_f16(half %x, i32 %y) {
 entry:
   %call = call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -377,9 +377,9 @@ declare float @_Z4pownfi(float, i32)
 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
 ; GCN: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN: %__pow_sign = and i32 %[[r0]], -2147483648
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r1:.+]] = bitcast i32 %__pow_sign to float
+; GCN: %[[r2:.+]] = tail call fast float @llvm.copysign.f32(float %__exp2, float %[[r1]])
+; GCN: store float %[[r2]], ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -413,9 +413,9 @@ entry:
 ; GCN: %__yeven = shl i32 %conv, 31
 ; GCN: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN: %__pow_sign = and i32 %__yeven, %[[r0]]
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r1:.*]] = bitcast i32 %__pow_sign to float
+; GCN: %[[r2:.*]] = tail call fast float @llvm.copysign.f32(float %__exp2, float %[[r1]])
+; GCN: store float %[[r2]], ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -437,9 +437,9 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
 ; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
 ; GCN: %1 = bitcast half %x to i16
 ; GCN: %__pow_sign = and i16 %1, -32768
-; GCN: %2 = bitcast half %__exp2 to i16
-; GCN: %3 = or disjoint i16 %__pow_sign, %2
-; GCN: %4 = bitcast i16 %3 to half
+; GCN: %2 = bitcast i16 %__pow_sign to half
+; GCN: %__pow_sign1 = tail call fast half @llvm.copysign.f16(half %__exp2, half %2)
+; GCN: ret half %__pow_sign1
 define half @test_pow_fast_f16__y_13(half %x) {
   %powr = tail call fast half @_Z3powDhDh(half %x, half 13.0)
   ret half %powr
@@ -452,9 +452,9 @@ define half @test_pow_fast_f16__y_13(half %x) {
 ; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx)
 ; GCN: %1 = bitcast <2 x half> %x to <2 x i16>
 ; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
-; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
-; GCN: %3 = or disjoint <2 x i16> %__pow_sign, %2
-; GCN: %4 = bitcast <2 x i16> %3 to <2 x half>
+; GCN: %2 = bitcast <2 x i16> %__pow_sign to <2 x half>
+; GCN: %__pow_sign1 = tail call fast <2 x half> @llvm.copysign.v2f16(<2 x half> %__exp2, <2 x half> %2)
+; GCN: ret <2 x half> %__pow_sign1
 define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) {
   %powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>)
   ret <2 x half> %powr
