[llvm] 528bcf3 - AMDGPU: Restore deleted test checks from test

Matt Arsenault via llvm-commits llvm-commits@lists.llvm.org
Thu Sep 19 21:49:54 PDT 2024


Author: Matt Arsenault
Date: 2024-09-20T08:49:48+04:00
New Revision: 528bcf3a55ca520c31c77ed5fbacf09bff8f39ec

URL: https://github.com/llvm/llvm-project/commit/528bcf3a55ca520c31c77ed5fbacf09bff8f39ec
DIFF: https://github.com/llvm/llvm-project/commit/528bcf3a55ca520c31c77ed5fbacf09bff8f39ec.diff

LOG: AMDGPU: Restore deleted test checks from test

These were accidentally removed in 758444ca3e7163a1504eeced3383af861d01d761

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 32b2fa238cbac4..0025d23b108038 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -18,33 +18,189 @@ declare hidden half @_Z4pownDhi(half, i32)
 ; --------------------------------------------------------------------
 
 define half @test_pow_fast_f16(half %x, half %y) {
+; CHECK-LABEL: test_pow_fast_f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12
+; CHECK-NEXT:    s_setpc_b64 s[16:17]
   %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
   ret half %pow
 }
 
 define float @test_pow_fast_f32(float %x, float %y) {
+; CHECK-LABEL: test_pow_fast_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powff@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powff@rel32@hi+12
+; CHECK-NEXT:    s_setpc_b64 s[16:17]
   %pow = tail call fast float @_Z3powff(float %x, float %y)
   ret float %pow
 }
 
 define double @test_pow_fast_f64(double %x, double %y) {
+; CHECK-LABEL: test_pow_fast_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powdd@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12
+; CHECK-NEXT:    s_setpc_b64 s[16:17]
   %pow = tail call fast double @_Z3powdd(double %x, double %y)
   ret double %pow
 }
 
 define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
+; CHECK-LABEL: test_pow_fast_f16__integral_y:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
+; CHECK-NEXT:    v_exp_f16_e32 v2, v2
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to half
   %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
   ret half %pow
 }
 
 define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
+; CHECK-LABEL: test_pow_fast_f32__integral_y:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
+; CHECK-NEXT:    v_log_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
+; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
+; CHECK-NEXT:    v_exp_f32_e32 v2, v2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
+; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to float
   %pow = tail call fast float @_Z3powff(float %x, float %y)
   ret float %pow
 }
 
 define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
+; CHECK-LABEL: test_pow_fast_f64__integral_y:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s16, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
+; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    s_addk_i32 s32, 0x800
+; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
+; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_mov_b32_e32 v42, v1
+; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v41, v2
+; CHECK-NEXT:    s_mov_b32 s42, s15
+; CHECK-NEXT:    s_mov_b32 s43, s14
+; CHECK-NEXT:    s_mov_b32 s44, s13
+; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s45
+; CHECK-NEXT:    s_mov_b32 s13, s44
+; CHECK-NEXT:    s_mov_b32 s14, s43
+; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
+; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
+; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_addk_i32 s32, 0xf800
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to double
   %pow = tail call fast double @_Z3powdd(double %x, double %y)
   ret double %pow
@@ -55,16 +211,132 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; --------------------------------------------------------------------
 
 define half @test_powr_fast_f16(half %x, half %y) {
+; CHECK-LABEL: test_powr_fast_f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_log_f16_e32 v0, v0
+; CHECK-NEXT:    v_mul_f16_e32 v0, v1, v0
+; CHECK-NEXT:    v_exp_f16_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
   ret half %powr
 }
 
 define float @test_powr_fast_f32(float %x, float %y) {
+; CHECK-LABEL: test_powr_fast_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v3
+; CHECK-NEXT:    v_log_f32_e32 v0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
+; CHECK-NEXT:    v_mul_f32_e32 v2, v1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; CHECK-NEXT:    v_fma_f32 v0, v1, v0, v2
+; CHECK-NEXT:    v_exp_f32_e32 v0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %powr = tail call fast float @_Z4powrff(float %x, float %y)
   ret float %powr
 }
 
 define double @test_powr_fast_f64(double %x, double %y) {
+; CHECK-LABEL: test_powr_fast_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s16, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
+; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    s_addk_i32 s32, 0x800
+; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
+; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v42, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v41, v3
+; CHECK-NEXT:    v_mov_b32_e32 v40, v2
+; CHECK-NEXT:    s_mov_b32 s42, s15
+; CHECK-NEXT:    s_mov_b32 s43, s14
+; CHECK-NEXT:    s_mov_b32 s44, s13
+; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[40:41], v[0:1]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s45
+; CHECK-NEXT:    s_mov_b32 s13, s44
+; CHECK-NEXT:    s_mov_b32 s14, s43
+; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    v_mov_b32_e32 v31, v42
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_addk_i32 s32, 0xf800
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %powr = tail call fast double @_Z4powrdd(double %x, double %y)
   ret double %powr
 }
@@ -74,51 +346,429 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; --------------------------------------------------------------------
 
 define half @test_pown_fast_f16(half %x, i32 %y) {
+; CHECK-LABEL: test_pown_fast_f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
+; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
+; CHECK-NEXT:    v_exp_f16_e32 v2, v2
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
   ret half %call
 }
 
 define float @test_pown_fast_f32(float %x, i32 %y) {
+; CHECK-LABEL: test_pown_fast_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
+; CHECK-NEXT:    v_log_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
+; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
+; CHECK-NEXT:    v_exp_f32_e32 v2, v2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
+; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
   ret float %call
 }
 
 define double @test_pown_fast_f64(double %x, i32 %y) {
+; CHECK-LABEL: test_pown_fast_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s16, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
+; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    s_addk_i32 s32, 0x800
+; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
+; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_mov_b32_e32 v42, v1
+; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v41, v2
+; CHECK-NEXT:    s_mov_b32 s42, s15
+; CHECK-NEXT:    s_mov_b32 s43, s14
+; CHECK-NEXT:    s_mov_b32 s44, s13
+; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s45
+; CHECK-NEXT:    s_mov_b32 s13, s44
+; CHECK-NEXT:    s_mov_b32 s14, s43
+; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
+; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
+; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_addk_i32 s32, 0xf800
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
 }
 
 define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f16_known_even:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_log_f16_e64 v0, |v0|
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
+; CHECK-NEXT:    v_exp_f16_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i32 %y.arg, 1
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
   ret half %call
 }
 
 define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f32_known_even:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v0, |v0|, v3
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; CHECK-NEXT:    v_log_f32_e32 v0, v0
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
+; CHECK-NEXT:    v_mul_f32_e32 v2, v0, v1
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; CHECK-NEXT:    v_fma_f32 v0, v0, v1, v2
+; CHECK-NEXT:    v_exp_f32_e32 v0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i32 %y.arg, 1
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
   ret float %call
 }
 
 define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f64_known_even:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s16, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
+; CHECK-NEXT:    v_writelane_b32 v42, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v42, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v42, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v42, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v42, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v42, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v42, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v42, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v42, s39, 7
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v42, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v42, s41, 9
+; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v42, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v42, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v42, s44, 12
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v42, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s42, s15
+; CHECK-NEXT:    s_mov_b32 s43, s14
+; CHECK-NEXT:    s_mov_b32 s44, s13
+; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v41, 1, v2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s45
+; CHECK-NEXT:    s_mov_b32 s13, s44
+; CHECK-NEXT:    s_mov_b32 s14, s43
+; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s45, v42, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v42, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v42, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v42, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v42, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v42, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v42, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v42, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v42, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v42, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v42, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v42, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v42, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v42, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v42, 14
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i32 %y.arg, 1
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
 }
 
 define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f16_known_odd:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_log_f16_e64 v2, |v0|
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f16_e32 v1, v2, v1
+; CHECK-NEXT:    v_exp_f16_e32 v1, v1
+; CHECK-NEXT:    v_bfi_b32 v0, s4, v1, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
   ret half %call
 }
 
 define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f32_known_odd:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, 0x800000
+; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
+; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
+; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
+; CHECK-NEXT:    v_log_f32_e32 v3, v3
+; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
+; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v1
+; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x42800000
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
+; CHECK-NEXT:    v_fma_f32 v1, v2, v1, v3
+; CHECK-NEXT:    v_exp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CHECK-NEXT:    s_brev_b32 s4, -2
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CHECK-NEXT:    v_bfi_b32 v0, s4, v1, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
   ret float %call
 }
 
 define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
+; CHECK-LABEL: test_pown_fast_f64_known_odd:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s16, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
+; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
+; CHECK-NEXT:    s_addk_i32 s32, 0x800
+; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
+; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_mov_b32_e32 v41, v1
+; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v41
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s42, s15
+; CHECK-NEXT:    s_mov_b32 s43, s14
+; CHECK-NEXT:    s_mov_b32 s44, s13
+; CHECK-NEXT:    s_mov_b32 s45, s12
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; CHECK-NEXT:    v_or_b32_e32 v42, 1, v2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s45
+; CHECK-NEXT:    s_mov_b32 s13, s44
+; CHECK-NEXT:    s_mov_b32 s14, s43
+; CHECK-NEXT:    s_mov_b32 s15, s42
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x80000000, v41
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
+; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_addk_i32 s32, 0xf800
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
@@ -126,5 +776,3 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}


        


More information about the llvm-commits mailing list