[llvm] dac8f97 - AMDGPU: Handle sitofp and uitofp exponents in fast pow expansion
Matt Arsenault via llvm-commits
llvm-commits@lists.llvm.org
Fri Sep 1 05:22:30 PDT 2023
Author: Matt Arsenault
Date: 2023-09-01T08:22:16-04:00
New Revision: dac8f974b51b076773ec49603df4e627f4d9089c
URL: https://github.com/llvm/llvm-project/commit/dac8f974b51b076773ec49603df4e627f4d9089c
DIFF: https://github.com/llvm/llvm-project/commit/dac8f974b51b076773ec49603df4e627f4d9089c.diff
LOG: AMDGPU: Handle sitofp and uitofp exponents in fast pow expansion
https://reviews.llvm.org/D158996
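The fast expansion rewrites pow(x, y) as exp2(y * log2(|x|)) and then ORs x's sign bit back into the result when y is odd, which is only sound if y is an integral value. isKnownIntegral previously accepted only constants (and undef); with this change a sitofp/uitofp result also qualifies, provided it cannot be infinity (either ninf is set on the call or isKnownNeverInfinity can prove it). As a rough sketch, not code from the patch, the scalar float math the expanded IR performs for an integral y looks like:

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  // Illustrative only: mirrors the expanded IR checked in the tests below.
  static float fast_pow_integral_y(float x, float y) {
    // |x|^y via exp2(y * log2(|x|)).
    float mag = std::exp2(y * std::log2(std::fabs(x)));
    // Sign fixup: (fptosi y) << 31 puts y's low bit into the sign position;
    // AND with x's bits keeps x's sign only when y is odd, then OR it in.
    uint32_t ybit = (uint32_t)(int32_t)y << 31;
    uint32_t xbits, mbits;
    std::memcpy(&xbits, &x, sizeof(float));
    std::memcpy(&mbits, &mag, sizeof(float));
    uint32_t rbits = (ybit & xbits) | mbits;
    float r;
    std::memcpy(&r, &rbits, sizeof(float));
    return r;
  }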
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index edc1077de601e2..0fb1474f80127a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -567,7 +567,8 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
return true;
}
-static bool isKnownIntegral(const Value *V) {
+static bool isKnownIntegral(const Value *V, const DataLayout &DL,
+ FastMathFlags FMF) {
if (isa<UndefValue>(V))
return true;
@@ -587,6 +588,24 @@ static bool isKnownIntegral(const Value *V) {
return true;
}
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ // TODO: Could check nofpclass(inf) on incoming argument
+ if (FMF.noInfs())
+ return true;
+
+ // Need to check whether the int size can produce infinity, which
+ // computeKnownFPClass already knows how to do.
+ return isKnownNeverInfinity(I, DL);
+ default:
+ break;
+ }
+
return false;
}
@@ -1013,7 +1032,7 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
// We cannot handle corner cases for a general pow() function, give up
// unless y is a constant integral value. Then proceed as if it were pown.
- if (!isKnownIntegral(opr1))
+ if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
return false;
}
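One note on the infinity check added above (not part of the commit message): a wide enough integer can round to infinity under uitofp/sitofp, and infinity is not an integral value, so the expansion's sign fixup would be wrong for negative x. computeKnownFPClass can prove non-infinity by, roughly, comparing the source bit width against the destination format's largest exponent; a hypothetical standalone helper illustrating that comparison (not the in-tree code):

  #include <cstdio>

  // Can a SrcBits-wide integer, converted to a float format whose largest
  // unbiased exponent is MaxExp (15 for half, 127 for float, 1023 for
  // double), ever round to infinity?
  static bool intToFPCanBeInfinity(unsigned SrcBits, bool IsSigned, int MaxExp) {
    // Signed sources spend one bit on the sign; the INT_MIN magnitude of
    // exactly 2^(SrcBits-1) still fits because the largest finite value is
    // just under 2^(MaxExp+1).
    int MagnitudeBits = IsSigned ? (int)SrcBits - 1 : (int)SrcBits;
    return MagnitudeBits > MaxExp;
  }

  int main() {
    printf("i32  sitofp to float : %d\n", intToFPCanBeInfinity(32, true, 127));   // 0
    printf("i256 uitofp to float : %d\n", intToFPCanBeInfinity(256, false, 127)); // 1
    printf("i256 uitofp to double: %d\n", intToFPCanBeInfinity(256, false, 1023)); // 0
    return 0;
  }

The tests updated below all carry ninf (via fast or explicit flags), so the FastMathFlags path answers first; the isKnownNeverInfinity fallback is what covers calls without ninf.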
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index b58161c9c3419e..a8f75f039a0df7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -57,12 +57,95 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f16__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: s_mov_b32 s16, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[18:19]
+; CHECK-NEXT: v_writelane_b32 v40, s16, 14
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: v_writelane_b32 v40, s40, 8
+; CHECK-NEXT: v_writelane_b32 v40, s41, 9
+; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: v_mov_b32_e32 v42, v0
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v1
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s42, 10
+; CHECK-NEXT: v_writelane_b32 v40, s43, 11
+; CHECK-NEXT: v_writelane_b32 v40, s44, 12
+; CHECK-NEXT: v_cvt_f16_f32_e32 v43, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v42
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: v_writelane_b32 v40, s45, 13
+; CHECK-NEXT: v_mov_b32_e32 v41, v31
+; CHECK-NEXT: s_mov_b32 s42, s15
+; CHECK-NEXT: s_mov_b32 s43, s14
+; CHECK-NEXT: s_mov_b32 s44, s13
+; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_mul_f16_e32 v0, v0, v43
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s45
+; CHECK-NEXT: s_mov_b32 s13, s44
+; CHECK-NEXT: s_mov_b32 s14, s43
+; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_cvt_i16_f16_e32 v1, v43
+; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1
+; CHECK-NEXT: v_and_b32_e32 v1, v1, v42
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_or_b32_e32 v0, v1, v0
+; CHECK-NEXT: v_readlane_b32 s45, v40, 13
+; CHECK-NEXT: v_readlane_b32 s44, v40, 12
+; CHECK-NEXT: v_readlane_b32 s43, v40, 11
+; CHECK-NEXT: v_readlane_b32 s42, v40, 10
+; CHECK-NEXT: v_readlane_b32 s41, v40, 9
+; CHECK-NEXT: v_readlane_b32 s40, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s4, v40, 14
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_addk_i32 s32, 0xf800
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to half
%pow = tail call fast half @_Z3powDhDh(half %x, half %y)
ret half %pow
@@ -72,11 +155,93 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f32__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z3powff@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powff@rel32@hi+12
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: s_mov_b32 s16, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[18:19]
+; CHECK-NEXT: v_writelane_b32 v40, s16, 14
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: v_writelane_b32 v40, s40, 8
+; CHECK-NEXT: v_writelane_b32 v40, s41, 9
+; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s42, 10
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: v_writelane_b32 v40, s43, 11
+; CHECK-NEXT: v_mov_b32_e32 v42, v0
+; CHECK-NEXT: v_writelane_b32 v40, s44, 12
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v42
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: v_writelane_b32 v40, s45, 13
+; CHECK-NEXT: v_mov_b32_e32 v41, v31
+; CHECK-NEXT: s_mov_b32 s42, s15
+; CHECK-NEXT: s_mov_b32 s43, s14
+; CHECK-NEXT: s_mov_b32 s44, s13
+; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: v_cvt_f32_i32_e32 v43, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_mul_f32_e32 v0, v0, v43
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s45
+; CHECK-NEXT: s_mov_b32 s13, s44
+; CHECK-NEXT: s_mov_b32 s14, s43
+; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v43
+; CHECK-NEXT: v_readlane_b32 s45, v40, 13
+; CHECK-NEXT: v_readlane_b32 s44, v40, 12
+; CHECK-NEXT: v_readlane_b32 s43, v40, 11
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; CHECK-NEXT: v_and_or_b32 v0, v1, v42, v0
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s42, v40, 10
+; CHECK-NEXT: v_readlane_b32 s41, v40, 9
+; CHECK-NEXT: v_readlane_b32 s40, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s4, v40, 14
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_addk_i32 s32, 0xf800
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to float
%pow = tail call fast float @_Z3powff(float %x, float %y)
ret float %pow
@@ -86,11 +251,98 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f64__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v2
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z3powdd@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12
-; CHECK-NEXT: s_setpc_b64 s[16:17]
+; CHECK-NEXT: s_mov_b32 s16, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[18:19]
+; CHECK-NEXT: v_writelane_b32 v40, s16, 14
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: s_addk_i32 s32, 0x800
+; CHECK-NEXT: v_writelane_b32 v40, s40, 8
+; CHECK-NEXT: v_writelane_b32 v40, s41, 9
+; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s42, 10
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: v_writelane_b32 v40, s43, 11
+; CHECK-NEXT: v_mov_b32_e32 v43, v1
+; CHECK-NEXT: v_writelane_b32 v40, s44, 12
+; CHECK-NEXT: v_mov_b32_e32 v42, v2
+; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: v_writelane_b32 v40, s45, 13
+; CHECK-NEXT: v_mov_b32_e32 v41, v31
+; CHECK-NEXT: s_mov_b32 s42, s15
+; CHECK-NEXT: s_mov_b32 s43, s14
+; CHECK-NEXT: s_mov_b32 s44, s13
+; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: v_cvt_f64_i32_e32 v[44:45], v42
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[44:45]
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s45
+; CHECK-NEXT: s_mov_b32 s13, s44
+; CHECK-NEXT: s_mov_b32 s14, s43
+; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42
+; CHECK-NEXT: v_and_b32_e32 v2, v2, v43
+; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
+; CHECK-NEXT: v_readlane_b32 s45, v40, 13
+; CHECK-NEXT: v_readlane_b32 s44, v40, 12
+; CHECK-NEXT: v_readlane_b32 s43, v40, 11
+; CHECK-NEXT: v_readlane_b32 s42, v40, 10
+; CHECK-NEXT: v_readlane_b32 s41, v40, 9
+; CHECK-NEXT: v_readlane_b32 s40, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: v_readlane_b32 s4, v40, 14
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_addk_i32 s32, 0xf800
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to double
%pow = tail call fast double @_Z3powdd(double %x, double %y)
ret double %pow
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
index 6b72ba56fd20b9..fe4c297680e38c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -2208,8 +2208,18 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp
; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
-; CHECK-NEXT: ret float [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; CHECK-NEXT: ret float [[TMP4]]
;
%y.cast = sitofp i32 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2280,8 +2290,18 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp
; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i32 [[Y]] to float
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
-; CHECK-NEXT: ret float [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; CHECK-NEXT: ret float [[TMP4]]
;
%y.cast = uitofp i32 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2318,8 +2338,18 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256
; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i256 [[Y]] to float
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
-; CHECK-NEXT: ret float [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; CHECK-NEXT: ret float [[TMP4]]
;
%y.cast = uitofp i256 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2330,8 +2360,18 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256(float %x, i2
; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256
; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i256 [[Y]] to float
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
-; CHECK-NEXT: ret float [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; CHECK-NEXT: ret float [[TMP4]]
;
%y.cast = sitofp i256 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2342,8 +2382,18 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp(<2 x floa
; CHECK-LABEL: define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp
; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float>
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn <2 x float> @_Z3powDv2_fS_(<2 x float> [[X]], <2 x float> [[Y_CAST]])
-; CHECK-NEXT: ret <2 x float> [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32>
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT: ret <2 x float> [[TMP4]]
;
%y.cast = sitofp <2 x i32> %y to <2 x float>
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2378,8 +2428,18 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp(<2 x floa
; CHECK-LABEL: define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp
; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp <2 x i32> [[Y]] to <2 x float>
-; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn <2 x float> @_Z3powDv2_fS_(<2 x float> [[X]], <2 x float> [[Y_CAST]])
-; CHECK-NEXT: ret <2 x float> [[POW]]
+; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
+; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
+; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32>
+; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
+; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[__POW_SIGN]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT: ret <2 x float> [[TMP4]]
;
%y.cast = uitofp <2 x i32> %y to <2 x float>
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)