[llvm] deefda7 - AMDGPU: Use exp2 and log2 intrinsics directly for f16/f32

Matt Arsenault via llvm-commits <llvm-commits@lists.llvm.org>
Fri Sep 1 05:22:32 PDT 2023


Author: Matt Arsenault
Date: 2023-09-01T08:22:16-04:00
New Revision: deefda7074398b7e8b7dae9dace43a8c694a49c1

URL: https://github.com/llvm/llvm-project/commit/deefda7074398b7e8b7dae9dace43a8c694a49c1
DIFF: https://github.com/llvm/llvm-project/commit/deefda7074398b7e8b7dae9dace43a8c694a49c1.diff

LOG: AMDGPU: Use exp2 and log2 intrinsics directly for f16/f32

The exp2 and log2 intrinsics codegen correctly for f16 and f32, but
the f64 versions do not. Emitting the intrinsics directly also avoids
losing fast-math flags on the way to the underlying intrinsic.

https://reviews.llvm.org/D158997
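
For illustration only (not part of the commit): a minimal IR sketch of the
powr-style core of the fold for f32, assuming a call with the `fast` flag.
The function name is made up; the point is that the fold now emits
llvm.log2/llvm.exp2 directly instead of the _Z4log2f/_Z4exp2f libcalls, so
the flags survive down to v_log_f32/v_exp_f32:

; Sketch, assuming `%pow = tail call fast float @_Z3powff(float %x, float %y)`
; as input to the fold; sign/|x| handling for pow/pown is omitted.
define float @powr_core_sketch(float %x, float %y) {
  %log2 = call fast float @llvm.log2.f32(float %x)
  %ylog2 = fmul fast float %y, %log2
  %exp2 = call fast float @llvm.exp2.f32(float %ylog2)
  ret float %exp2
}

declare float @llvm.log2.f32(float)
declare float @llvm.exp2.f32(float)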

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll
    llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 0fb1474f80127a..285768ff28a1df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -969,12 +969,19 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
     return true;
   }
 
+  // Whether to use the generic LLVM intrinsic instead of emitting a libcall.
+  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();
+
   // powr ---> exp2(y * log2(x))
   // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
-  FunctionCallee ExpExpr =
-      getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
-  if (!ExpExpr)
-    return false;
+  FunctionCallee ExpExpr;
+  if (ShouldUseIntrinsic) {
+    ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()});
+  } else {
+    ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
+    if (!ExpExpr)
+      return false;
+  }
 
   bool needlog = false;
   bool needabs = false;
@@ -1043,10 +1050,16 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
     nval = cnval ? cnval : opr0;
   }
   if (needlog) {
-    FunctionCallee LogExpr =
-        getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
-    if (!LogExpr)
-      return false;
+    FunctionCallee LogExpr;
+    if (ShouldUseIntrinsic) {
+      LogExpr =
+          Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()});
+    } else {
+      LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
+      if (!LogExpr)
+        return false;
+    }
+
     nval = CreateCallEx(B, LogExpr, nval, "__log2");
   }
 

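As a hedged illustration of why the checks below change: the intrinsic path
is gated on isFloatTy/isHalfTy above, so the f64 fold keeps going through
the OCML libcalls. A fast pow core for double still looks roughly like this
(sketch only; the function name is illustrative):

; f64 stays on the libcall path, matching the _Z4log2d/_Z4exp2d
; references in the updated checks below.
define double @powr_core_sketch_f64(double %x, double %y) {
  %log2 = call fast double @_Z4log2d(double %x)
  %ylog2 = fmul fast double %y, %log2
  %exp2 = call fast double @_Z4exp2d(double %ylog2)
  ret double %exp2
}

declare double @_Z4log2d(double)
declare double @_Z4exp2d(double)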
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index a8f75f039a0df7..bd3fc6f0589b6d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -57,10 +57,62 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
 ; CHECK-LABEL: test_pow_fast_f16__integral_y:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_log_f16_e64 v2, |v0|
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f16_e32 v2, v2, v1
+; CHECK-NEXT:    v_exp_f16_e32 v2, v2
+; CHECK-NEXT:    v_cvt_i16_f16_e32 v1, v1
+; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %y = sitofp i32 %y.i to half
+  %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
+  ret half %pow
+}
+
+define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
+; CHECK-LABEL: test_pow_fast_f32__integral_y:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
+; CHECK-NEXT:    v_log_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
+; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v1
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
+; CHECK-NEXT:    v_fma_f32 v2, v2, v1, v3
+; CHECK-NEXT:    v_exp_f32_e32 v2, v2
+; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %y = sitofp i32 %y.i to float
+  %pow = tail call fast float @_Z3powff(float %x, float %y)
+  ret float %pow
+}
+
+define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
+; CHECK-LABEL: test_pow_fast_f64__integral_y:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
@@ -76,19 +128,20 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
 ; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_mov_b32_e32 v42, v0
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v0, v1
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
+; CHECK-NEXT:    v_mov_b32_e32 v43, v1
 ; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v43, v0
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v42
+; CHECK-NEXT:    v_mov_b32_e32 v42, v2
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v43
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v31
@@ -99,13 +152,14 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[44:45], v42
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[44:45]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v43
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
@@ -117,13 +171,14 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v41
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_i16_f16_e32 v1, v43
-; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
-; CHECK-NEXT:    v_and_b32_e32 v1, v1, v42
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_or_b32_e32 v0, v1, v0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v42
+; CHECK-NEXT:    v_and_b32_e32 v2, v2, v43
+; CHECK-NEXT:    buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
 ; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
 ; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
 ; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
@@ -140,19 +195,63 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xf800
 ; CHECK-NEXT:    s_mov_b32 s33, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %y = sitofp i32 %y.i to half
-  %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
-  ret half %pow
+  %y = sitofp i32 %y.i to double
+  %pow = tail call fast double @_Z3powdd(double %x, double %y)
+  ret double %pow
 }
 
-define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
-; CHECK-LABEL: test_pow_fast_f32__integral_y:
+; --------------------------------------------------------------------
+; test powr
+; --------------------------------------------------------------------
+
+define half @test_powr_fast_f16(half %x, half %y) {
+; CHECK-LABEL: test_powr_fast_f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_log_f16_e32 v0, v0
+; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
+; CHECK-NEXT:    v_exp_f16_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
+  ret half %powr
+}
+
+define float @test_powr_fast_f32(float %x, float %y) {
+; CHECK-LABEL: test_powr_fast_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v3
+; CHECK-NEXT:    v_log_f32_e32 v0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
+; CHECK-NEXT:    v_mul_f32_e32 v2, v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; CHECK-NEXT:    v_fma_f32 v0, v0, v1, v2
+; CHECK-NEXT:    v_exp_f32_e32 v0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %powr = tail call fast float @_Z4powrff(float %x, float %y)
+  ret float %powr
+}
+
+define double @test_powr_fast_f64(double %x, double %y) {
+; CHECK-LABEL: test_powr_fast_f64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
@@ -174,20 +273,18 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 ; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v42, v0
 ; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    v_mov_b32_e32 v43, v31
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
@@ -195,14 +292,15 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v43, v1
+; CHECK-NEXT:    v_mov_b32_e32 v42, v3
+; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[41:42]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v43
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
@@ -211,18 +309,15 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
 ; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v31, v43
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v43
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; CHECK-NEXT:    v_and_or_b32 v0, v1, v42, v0
 ; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
 ; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
 ; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
 ; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
@@ -242,19 +337,70 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b32 s33, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %y = sitofp i32 %y.i to float
-  %pow = tail call fast float @_Z3powff(float %x, float %y)
-  ret float %pow
+  %powr = tail call fast double @_Z4powrdd(double %x, double %y)
+  ret double %powr
 }
 
-define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
-; CHECK-LABEL: test_pow_fast_f64__integral_y:
+; --------------------------------------------------------------------
+; test pown
+; --------------------------------------------------------------------
+
+define half @test_pown_fast_f16(half %x, i32 %y) {
+; CHECK-LABEL: test_pown_fast_f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
+; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
+; CHECK-NEXT:    v_exp_f16_e32 v2, v2
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
+  ret half %call
+}
+
+define float @test_pown_fast_f32(float %x, i32 %y) {
+; CHECK-LABEL: test_pown_fast_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
+; CHECK-NEXT:    v_log_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
+; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
+; CHECK-NEXT:    v_exp_f32_e32 v2, v2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
+; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
+  ret float %call
+}
+
+define double @test_pown_fast_f64(double %x, i32 %y) {
+; CHECK-LABEL: test_pown_fast_f64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
 ; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
@@ -274,15 +420,12 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
 ; CHECK-NEXT:    v_mov_b32_e32 v43, v1
 ; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_mov_b32_e32 v42, v2
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v43
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
@@ -294,15 +437,16 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[44:45], v42
+; CHECK-NEXT:    v_mov_b32_e32 v42, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[44:45]
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
@@ -315,11 +459,9 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v42
 ; CHECK-NEXT:    v_and_b32_e32 v2, v2, v43
-; CHECK-NEXT:    buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
 ; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
 ; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
@@ -337,23 +479,65 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xf800
 ; CHECK-NEXT:    s_mov_b32 s33, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %y = sitofp i32 %y.i to double
-  %pow = tail call fast double @_Z3powdd(double %x, double %y)
-  ret double %pow
+  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
+  ret double %call
 }
 
-; --------------------------------------------------------------------
-; test powr
-; --------------------------------------------------------------------
+define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f16_known_even:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_log_f16_e64 v0, |v0|
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
+; CHECK-NEXT:    v_exp_f16_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %y = shl i32 %y.arg, 1
+  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
+  ret half %call
+}
 
-define half @test_powr_fast_f16(half %x, half %y) {
-; CHECK-LABEL: test_powr_fast_f16:
+define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f32_known_even:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v0, |v0|, v3
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; CHECK-NEXT:    v_log_f32_e32 v0, v0
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
+; CHECK-NEXT:    v_mul_f32_e32 v2, v0, v1
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; CHECK-NEXT:    v_fma_f32 v0, v0, v1, v2
+; CHECK-NEXT:    v_exp_f32_e32 v0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %y = shl i32 %y.arg, 1
+  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
+  ret float %call
+}
+
+define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f64_known_even:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
@@ -375,12 +559,13 @@ define half @test_powr_fast_f16(half %x, half %y) {
 ; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
 ; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
 ; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -393,15 +578,16 @@ define half @test_powr_fast_f16(half %x, half %y) {
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v1
+; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
@@ -436,846 +622,23 @@ define half @test_powr_fast_f16(half %x, half %y) {
 ; CHECK-NEXT:    s_mov_b32 s33, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
-  ret half %powr
+  %y = shl i32 %y.arg, 1
+  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
+  ret double %call
 }
 
-define float @test_powr_fast_f32(float %x, float %y) {
-; CHECK-LABEL: test_powr_fast_f32:
+define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f16_known_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v42
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %powr = tail call fast float @_Z4powrff(float %x, float %y)
-  ret float %powr
-}
-
-define double @test_powr_fast_f64(double %x, double %y) {
-; CHECK-LABEL: test_powr_fast_f64:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v43, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v3
-; CHECK-NEXT:    v_mov_b32_e32 v41, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[41:42]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v43
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %powr = tail call fast double @_Z4powrdd(double %x, double %y)
-  ret double %powr
-}
-
-; --------------------------------------------------------------------
-; test pown
-; --------------------------------------------------------------------
-
-define half @test_pown_fast_f16(half %x, i32 %y) {
-; CHECK-LABEL: test_pown_fast_f16:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v43, v0
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v43
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v42
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v42
-; CHECK-NEXT:    v_and_b32_e32 v1, v1, v43
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_or_b32_e32 v0, v1, v0
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
-  ret half %call
-}
-
-define float @test_pown_fast_f32(float %x, i32 %y) {
-; CHECK-LABEL: test_pown_fast_f32:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v43, v0
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v43
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v42
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v42
-; CHECK-NEXT:    v_and_or_b32 v0, v1, v43, v0
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
-  ret float %call
-}
-
-define double @test_pown_fast_f64(double %x, i32 %y) {
-; CHECK-LABEL: test_pown_fast_f64:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v43, v1
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v43
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v42
-; CHECK-NEXT:    v_and_b32_e32 v2, v2, v43
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
-  ret double %call
-}
-
-define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f16_known_even:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v42
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %y = shl i32 %y.arg, 1
-  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
-  ret half %call
-}
-
-define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f32_known_even:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v42
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %y = shl i32 %y.arg, 1
-  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
-  ret float %call
-}
-
-define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f64_known_even:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 1, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %y = shl i32 %y.arg, 1
-  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
-  ret double %call
-}
-
-define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f16_known_odd:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v42, v0
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v42
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_or_b32_e32 v43, 1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v43
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_log_f16_e64 v2, |v0|
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff8000, v42
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_or_b32_e32 v0, v1, v0
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mul_f16_e32 v1, v2, v1
+; CHECK-NEXT:    v_exp_f16_e32 v1, v1
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -1286,92 +649,29 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
 ; CHECK-LABEL: test_pown_fast_f32_known_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v42, v0
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v42
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT:    v_or_b32_e32 v43, 1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v43
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
+; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
+; CHECK-NEXT:    v_log_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
+; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v1
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
+; CHECK-NEXT:    v_fma_f32 v1, v2, v1, v3
+; CHECK-NEXT:    v_exp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; CHECK-NEXT:    s_brev_b32 s4, 1
-; CHECK-NEXT:    v_and_or_b32 v0, v42, s4, v0
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
index fe4c297680e38c..2ebe2562ddb469 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -1074,9 +1074,9 @@ define float @test_pow_afn_f32_nnan_x_known_positive(float nofpclass(ninf nnorm
 define float @test_pow_afn_f32_nnan_ninf_x_known_positive(float nofpclass(ninf nnorm nsub) %x, float %y) {
 ; CHECK-LABEL: define float @test_pow_afn_f32_nnan_ninf_x_known_positive
 ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], float [[Y:%.*]]) {
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    ret float [[__EXP2]]
 ;
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y)
@@ -1096,9 +1096,9 @@ define <2 x float> @test_pow_afn_v2f32_nnan_x_known_positive(<2 x float> nofpcla
 define <2 x float> @test_pow_afn_v2f32_nnan_ninf_x_known_positive(<2 x float> nofpclass(ninf nnorm nsub) %x, <2 x float> %y) {
 ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf_x_known_positive
 ; CHECK-SAME: (<2 x float> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x float> [[Y:%.*]]) {
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]])
 ; CHECK-NEXT:    ret <2 x float> [[__EXP2]]
 ;
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y)
@@ -1242,9 +1242,9 @@ define half @test_pow_afn_f16_nnan_x_known_positive(half nofpclass(ninf nnorm ns
 define half @test_pow_afn_f16_nnan_ninf_x_known_positive(half nofpclass(ninf nnorm nsub) %x, half %y) {
 ; CHECK-LABEL: define half @test_pow_afn_f16_nnan_ninf_x_known_positive
 ; CHECK-SAME: (half nofpclass(ninf nsub nnorm) [[X:%.*]], half [[Y:%.*]]) {
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn half @_Z4log2Dh(half [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn half @llvm.log2.f16(half [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn half [[__LOG2]], [[Y]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn half @_Z4exp2Dh(half [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn half @llvm.exp2.f16(half [[__YLOGX]])
 ; CHECK-NEXT:    ret half [[__EXP2]]
 ;
   %pow = tail call afn nnan ninf half @_Z3powDhDh(half %x, half %y)
@@ -1264,9 +1264,9 @@ define <2 x half> @test_pow_afn_v2f16_nnan_x_known_positive(<2 x half> nofpclass
 define <2 x half> @test_pow_afn_v2f16_nnan_ninf_x_known_positive(<2 x half> nofpclass(ninf nnorm nsub) %x, <2 x half> %y) {
 ; CHECK-LABEL: define <2 x half> @test_pow_afn_v2f16_nnan_ninf_x_known_positive
 ; CHECK-SAME: (<2 x half> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x half> [[Y:%.*]]) {
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @_Z4log2Dv2_Dh(<2 x half> [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @llvm.log2.v2f16(<2 x half> [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x half> [[__LOG2]], [[Y]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @_Z4exp2Dv2_Dh(<2 x half> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @llvm.exp2.v2f16(<2 x half> [[__YLOGX]])
 ; CHECK-NEXT:    ret <2 x half> [[__EXP2]]
 ;
   %pow = tail call afn nnan ninf <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> %y)
@@ -1784,9 +1784,6 @@ define float @test_pow_afn_f32_nnan_ninf__y_10(float %x) {
 define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison(<2 x float> %x) {
 ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison
 ; CHECK-SAME: (<2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> poison)
 ; CHECK-NEXT:    ret <2 x float> poison
 ;
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> poison)
@@ -2209,9 +2206,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[X]] to i32
@@ -2291,9 +2288,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = uitofp i32 [[Y]] to float
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[X]] to i32
@@ -2339,9 +2336,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
 ; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = uitofp i256 [[Y]] to float
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[X]] to i32
@@ -2361,9 +2358,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256(float %x, i2
 ; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = sitofp i256 [[Y]] to float
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[X]] to i32
@@ -2383,9 +2380,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp(<2 x floa
 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float>
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[__FABS]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32>
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
@@ -2429,9 +2426,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp(<2 x floa
 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = uitofp <2 x i32> [[Y]] to <2 x float>
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[__FABS]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32>
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
@@ -2475,9 +2472,9 @@ define float @test_pow_afn_nnan_ninf_f32__known_positive_x__known_integral_sitof
 ; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32__known_positive_x__known_integral_sitofp
 ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    ret float [[__EXP2]]
 ;
   %y.cast = sitofp i32 %y to float

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
index ac179120a9575b..85e14048f39478 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
@@ -672,10 +672,10 @@ define float @test_pown_afn_nnan_ninf_f32(float %x, i32 %y) {
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
@@ -694,10 +694,10 @@ define <2 x float> @test_pown_afn_nnan_ninf_v2f32(<2 x float> %x, <2 x i32> %y)
 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[__FABS]])
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float>
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = fptosi <2 x float> [[POWNI2F]] to <2 x i32>
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
@@ -763,10 +763,10 @@ define half @test_pown_afn_nnan_ninf_f16(half %x, i32 %y) {
 ; CHECK-SAME: (half [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn half @llvm.fabs.f16(half [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn half @_Z4log2Dh(half [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn half @llvm.log2.f16(half [[__FABS]])
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp i32 [[Y]] to half
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn half [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn half @_Z4exp2Dh(half [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn half @llvm.exp2.f16(half [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = trunc i32 [[Y]] to i16
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i16 [[__YTOU]], 15
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast half [[X]] to i16
@@ -786,10 +786,10 @@ define <2 x half> @test_pown_afn_nnan_ninf_v2f16(<2 x half> %x, <2 x i32> %y) {
 ; CHECK-SAME: (<2 x half> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn <2 x half> @llvm.fabs.v2f16(<2 x half> [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @_Z4log2Dv2_Dh(<2 x half> [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @llvm.log2.v2f16(<2 x half> [[__FABS]])
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp <2 x i32> [[Y]] to <2 x half>
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x half> [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @_Z4exp2Dv2_Dh(<2 x half> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @llvm.exp2.v2f16(<2 x half> [[__YLOGX]])
 ; CHECK-NEXT:    [[__YTOU:%.*]] = fptosi <2 x half> [[POWNI2F]] to <2 x i16>
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i16> [[__YTOU]], <i16 15, i16 15>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x half> [[X]] to <2 x i16>
@@ -821,10 +821,10 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 {
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
@@ -841,9 +841,6 @@ entry:
 define float @test_pown_fast_f32__y_poison(float %x) {
 ; CHECK-LABEL: define float @test_pown_fast_f32__y_poison
 ; CHECK-SAME: (float [[X:%.*]]) {
-; CHECK-NEXT:    [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[__FABS]])
-; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @_Z4exp2f(float poison)
 ; CHECK-NEXT:    ret float poison
 ;
   %call = tail call fast float @_Z4pownfi(float %x, i32 poison)
@@ -1070,10 +1067,10 @@ define float @test_pown_afn_ninf_nnan_f32__x_known_positive(float nofpclass(ninf
 ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
@@ -1131,10 +1128,10 @@ define float @test_fast_pown_f32_y_known_even(float %x, i32 %y.arg) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[Y:%.*]] = shl i32 [[Y_ARG]], 1
 ; CHECK-NEXT:    [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    ret float [[__EXP2]]
 ;
 entry:
@@ -1149,10 +1146,10 @@ define float @test_fast_pown_f32_known_positive_y_known_even(float nofpclass(nin
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[Y:%.*]] = shl i32 [[Y_ARG]], 1
 ; CHECK-NEXT:    [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]])
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    ret float [[__EXP2]]
 ;
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll
index bd742bb68a16ac..0d9f3e2de416fa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll
@@ -25,9 +25,9 @@ declare <16 x half> @_Z4powrDv16_DhS_(<16 x half>, <16 x half>)
 define float @test_powr_fast_f32(float %x, float %y) {
 ; CHECK-LABEL: define float @test_powr_fast_f32
 ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) {
-; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @_Z4log2f(float [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[Y]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    ret float [[__EXP2]]
 ;
   %powr = tail call fast float @_Z4powrff(float %x, float %y)
@@ -37,9 +37,9 @@ define float @test_powr_fast_f32(float %x, float %y) {
 define <2 x float> @test_powr_fast_v2f32(<2 x float> %x, <2 x float> %y) {
 ; CHECK-LABEL: define <2 x float> @test_powr_fast_v2f32
 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) {
-; CHECK-NEXT:    [[__LOG2:%.*]] = call fast <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call fast <2 x float> @llvm.log2.v2f32(<2 x float> [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul fast <2 x float> [[__LOG2]], [[Y]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call fast <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call fast <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]])
 ; CHECK-NEXT:    ret <2 x float> [[__EXP2]]
 ;
   %powr = tail call fast <2 x float> @_Z4powrDv2_fS_(<2 x float> %x, <2 x float> %y)
@@ -449,7 +449,7 @@ define float @test_powr_afn_f32_nnan_minsize(float %x, float %y) #0 {
 define float @test_powr_afn_f32_noinline(float %x, float %y) {
 ; CHECK-LABEL: define float @test_powr_afn_f32_noinline
 ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) {
-; CHECK-NEXT:    [[POWR:%.*]] = tail call afn float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[POWR:%.*]] = tail call afn float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    ret float [[POWR]]
 ;
   %powr = tail call afn float @_Z4powrff(float %x, float %y) #1
@@ -459,7 +459,7 @@ define float @test_powr_afn_f32_noinline(float %x, float %y) {
 define float @test_powr_afn_f32_nnan_noinline(float %x, float %y) {
 ; CHECK-LABEL: define float @test_powr_afn_f32_nnan_noinline
 ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) {
-; CHECK-NEXT:    [[POWR:%.*]] = tail call nnan afn float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR3]]
+; CHECK-NEXT:    [[POWR:%.*]] = tail call nnan afn float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR4]]
 ; CHECK-NEXT:    ret float [[POWR]]
 ;
   %powr = tail call afn nnan float @_Z4powrff(float %x, float %y) #1
@@ -479,7 +479,7 @@ define float @test_powr_afn_f32_strictfp(float %x, float %y) #2 {
 define float @test_powr_fast_f32_nobuiltin(float %x, float %y) {
 ; CHECK-LABEL: define float @test_powr_fast_f32_nobuiltin
 ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) {
-; CHECK-NEXT:    [[POWR:%.*]] = tail call fast float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    [[POWR:%.*]] = tail call fast float @_Z4powrff(float [[X]], float [[Y]]) #[[ATTR5:[0-9]+]]
 ; CHECK-NEXT:    ret float [[POWR]]
 ;
   %powr = tail call fast float @_Z4powrff(float %x, float %y) #3
@@ -1010,9 +1010,9 @@ define float @test_powr_afn_f32_nnan_x_known_positive(float nofpclass(ninf nnorm
 define float @test_powr_afn_f32_nnan_ninf_x_known_positive(float nofpclass(ninf nnorm nsub) %x, float %y) {
 ; CHECK-LABEL: define float @test_powr_afn_f32_nnan_ninf_x_known_positive
 ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], float [[Y:%.*]]) {
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    ret float [[__EXP2]]
 ;
   %powr = tail call afn nnan ninf float @_Z4powrff(float %x, float %y)
@@ -1032,9 +1032,9 @@ define <2 x float> @test_powr_afn_v2f32_nnan_x_known_positive(<2 x float> nofpcl
 define <2 x float> @test_powr_afn_v2f32_nnan_ninf_x_known_positive(<2 x float> nofpclass(ninf nnorm nsub) %x, <2 x float> %y) {
 ; CHECK-LABEL: define <2 x float> @test_powr_afn_v2f32_nnan_ninf_x_known_positive
 ; CHECK-SAME: (<2 x float> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x float> [[Y:%.*]]) {
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]])
 ; CHECK-NEXT:    ret <2 x float> [[__EXP2]]
 ;
   %powr = tail call afn nnan ninf <2 x float> @_Z4powrDv2_fS_(<2 x float> %x, <2 x float> %y)
@@ -1109,9 +1109,9 @@ define float @test_powr_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y
 ; CHECK-LABEL: define float @test_powr_afn_nnan_ninf_f32_known_integral_sitofp
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    ret float [[__EXP2]]
 ;
   %y.cast = sitofp i32 %y to float
@@ -1147,9 +1147,9 @@ define float @test_powr_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y
 ; CHECK-LABEL: define float @test_powr_afn_nnan_ninf_f32_known_integral_uitofp
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = uitofp i32 [[Y]] to float
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]])
 ; CHECK-NEXT:    ret float [[__EXP2]]
 ;
   %y.cast = uitofp i32 %y to float
@@ -1161,9 +1161,9 @@ define <2 x float> @test_powr_afn_nnan_ninf_v2f32_known_integral_sitofp(<2 x flo
 ; CHECK-LABEL: define <2 x float> @test_powr_afn_nnan_ninf_v2f32_known_integral_sitofp
 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float>
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]])
 ; CHECK-NEXT:    ret <2 x float> [[__EXP2]]
 ;
   %y.cast = sitofp <2 x i32> %y to <2 x float>
@@ -1199,9 +1199,9 @@ define <2 x float> @test_powr_afn_nnan_ninf_v2f32_known_integral_uitofp(<2 x flo
 ; CHECK-LABEL: define <2 x float> @test_powr_afn_nnan_ninf_v2f32_known_integral_uitofp
 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_CAST:%.*]] = uitofp <2 x i32> [[Y]] to <2 x float>
-; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[X]])
+; CHECK-NEXT:    [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]])
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]])
 ; CHECK-NEXT:    ret <2 x float> [[__EXP2]]
 ;
   %y.cast = uitofp <2 x i32> %y to <2 x float>

diff  --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 2f4a7c5c9ea06a..87f69065c9fd59 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -351,10 +351,10 @@ declare half @_Z4pownDhi(half, i32)
 
 ; GCN-LABEL: {{^}}define half @test_pown_f16(
 ; GCN-NATIVE: %__fabs = tail call fast half @llvm.fabs.f16(half %x)
-; GCN-NATIVE: %__log2 = tail call fast half @_Z4log2Dh(half %__fabs)
+; GCN-NATIVE: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs)
 ; GCN-NATIVE: %pownI2F = sitofp i32 %y to half
 ; GCN-NATIVE: %__ylogx = fmul fast half %__log2, %pownI2F
-; GCN-NATIVE: %__exp2 = tail call fast half @_Z4exp2Dh(half %__ylogx)
+; GCN-NATIVE: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
 ; GCN-NATIVE: %__ytou = trunc i32 %y to i16
 ; GCN-NATIVE: %__yeven = shl i16 %__ytou, 15
 ; GCN-NATIVE: %0 = bitcast half %x to i16
@@ -371,16 +371,15 @@ entry:
 declare float @_Z4pownfi(float, i32)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow
-; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 1.013000e+03)
-; GCN-PRELINK: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
-; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
-; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
-; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648
-; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
-; GCN-PRELINK: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp)
+; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs)
+; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03
+; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
+; GCN: %[[r0:.*]] = bitcast float %tmp to i32
+; GCN: %__pow_sign = and i32 %[[r0]], -2147483648
+; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
+; GCN: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
+; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -390,15 +389,10 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr
-; GCN-POSTLINK: call fast float @_Z4powrff(float %tmp, float %tmp1)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
-; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
-; GCN-PRELINK: store float %__exp2, ptr addrspace(1) %a, align 4
-; GCN-NATIVE:  %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
-; GCN-NATIVE:  %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-NATIVE:  %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
-; GCN-NATIVE:  store float %__exp2, ptr addrspace(1) %a, align 4
+; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %tmp)
+; GCN: %__ylogx = fmul fast float %tmp1, %__log2
+; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
+; GCN: store float %__exp2, ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_powr(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -410,19 +404,18 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown
-; GCN-POSTLINK: call fast float @_Z4pownfi(float %tmp, i32 %conv)
-; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
-; GCN-PRELINK: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
-; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
-; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
-; GCN-PRELINK: %__yeven = shl i32 %conv, 31
-; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
-; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]]
-; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
-; GCN-PRELINK: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %conv = fptosi float %tmp1 to i32
+; GCN: %__fabs = tail call fast float @llvm.fabs.f32(float %tmp)
+; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs)
+; GCN: %pownI2F = sitofp i32 %conv to float
+; GCN: %__ylogx = fmul fast float %__log2, %pownI2F
+; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
+; GCN: %__yeven = shl i32 %conv, 31
+; GCN: %[[r0:.*]] = bitcast float %tmp to i32
+; GCN: %__pow_sign = and i32 %__yeven, %[[r0]]
+; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
+; GCN: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
+; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -438,30 +431,30 @@ declare half @_Z3powDhDh(half, half)
 declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
 
 ; GCN-LABEL: define half @test_pow_fast_f16__y_13(half %x)
-; GCN-PRELINK: %__fabs = tail call fast half @llvm.fabs.f16(half %x)
-; GCN-PRELINK: %__log2 = tail call fast half @_Z4log2Dh(half %__fabs)
-; GCN-PRELINK: %__ylogx = fmul fast half %__log2, 0xH4A80
-; GCN-PRELINK: %__exp2 = tail call fast half @_Z4exp2Dh(half %__ylogx)
-; GCN-PRELINK: %1 = bitcast half %x to i16
-; GCN-PRELINK: %__pow_sign = and i16 %1, -32768
-; GCN-PRELINK: %2 = bitcast half %__exp2 to i16
-; GCN-PRELINK: %3 = or i16 %__pow_sign, %2
-; GCN-PRELINK: %4 = bitcast i16 %3 to half
+; GCN: %__fabs = tail call fast half @llvm.fabs.f16(half %x)
+; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs)
+; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80
+; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
+; GCN: %1 = bitcast half %x to i16
+; GCN: %__pow_sign = and i16 %1, -32768
+; GCN: %2 = bitcast half %__exp2 to i16
+; GCN: %3 = or i16 %__pow_sign, %2
+; GCN: %4 = bitcast i16 %3 to half
 define half @test_pow_fast_f16__y_13(half %x) {
   %powr = tail call fast half @_Z3powDhDh(half %x, half 13.0)
   ret half %powr
 }
 
 ; GCN-LABEL: define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x)
-; GCN-PRELINK: %__fabs = tail call fast <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
-; GCN-PRELINK: %__log2 = tail call fast <2 x half> @_Z4log2Dv2_Dh(<2 x half> %__fabs)
-; GCN-PRELINK: %__ylogx = fmul fast <2 x half> %__log2, <half 0xH4A80, half 0xH4A80>
-; GCN-PRELINK: %__exp2 = tail call fast <2 x half> @_Z4exp2Dv2_Dh(<2 x half> %__ylogx)
-; GCN-PRELINK: %1 = bitcast <2 x half> %x to <2 x i16>
-; GCN-PRELINK: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
-; GCN-PRELINK: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
-; GCN-PRELINK: %3 = or <2 x i16> %__pow_sign, %2
-; GCN-PRELINK: %4 = bitcast <2 x i16> %3 to <2 x half>
+; GCN: %__fabs = tail call fast <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
+; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs)
+; GCN: %__ylogx = fmul fast <2 x half> %__log2, <half 0xH4A80, half 0xH4A80>
+; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx)
+; GCN: %1 = bitcast <2 x half> %x to <2 x i16>
+; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
+; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
+; GCN: %3 = or <2 x i16> %__pow_sign, %2
+; GCN: %4 = bitcast <2 x i16> %3 to <2 x half>
 define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) {
   %powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>)
   ret <2 x half> %powr
@@ -673,11 +666,11 @@ entry:
 declare float @_Z5log10f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
-; GCN-NATIVE: %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4
-; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
-; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
-; GCN-NATIVE: store float %__exp2, ptr addrspace(1) %a, align 4
+; GCN: %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4
+; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %tmp)
+; GCN: %__ylogx = fmul fast float %tmp1, %__log2
+; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
+; GCN: store float %__exp2, ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_use_native_powr(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4


        

