[llvm] [AMDGPU] Extend llvm.amdgcn.set.inactive intrinsic to support Reg32/Reg64 types (PR #94457)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 5 03:40:42 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Vikram Hegde (vikramRH)
Changes:
Missed this while handling other patches. Any comments/concerns?
---
Patch is 20.48 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/94457.diff
4 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+1-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+12-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+228)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll (+230)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bfb966dfad02d..1ce6acfdd6dc1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2356,7 +2356,7 @@ def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
- Intrinsic<[llvm_anyint_ty],
+ Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, // value to be copied
LLVMMatchType<0>], // value for the inactive lanes to take
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c32..418458f0171bd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -252,16 +252,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VSrc_b32: $src, VSrc_b32:$inactive),
- [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
-}
+ (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64: $src, VSrc_b64:$inactive),
- [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
-}
+ (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC]
+foreach vt = Reg32Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
+}
+
+foreach vt = Reg64Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+}
+
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index cbee039df7fd0..e08b7cf38ebb5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -93,6 +93,234 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
ret void
}
+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ store float %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ store double %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ store <2 x i16> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ store <2 x half> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 1
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ store <2 x i32> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 1.0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ store <2 x float> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ store ptr %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ store ptr addrspace(6) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 8302af7450ed9..40d08f8692fb1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -124,6 +124,236 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
ret void
}
+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x40400000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ store float %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ store double %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x10001
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ store <2 x i16> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ store <2 x half> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ store <2 x i32> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 1.0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ store <2 x float> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ store ptr %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v...
[truncated]
``````````
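For reference (my sketch, not part of the patch): with the overload type relaxed from `llvm_anyint_ty` to `llvm_any_ty`, non-integer overloads such as `f32` now select directly through `V_SET_INACTIVE_B32`/`V_SET_INACTIVE_B64` via the new `foreach` patterns over `Reg32Types`/`Reg64Types`, so frontends no longer need to bitcast through `i32`/`i64`. A minimal IR example, mirroring the new tests (the function name and calling convention are illustrative):

```llvm
; Minimal sketch: the f32 overload, which the old llvm_anyint_ty
; signature rejected. Active lanes keep %active; inactive lanes
; take the second operand (0.0 here).
define amdgpu_ps float @set_inactive_example(float %active) {
  %v = call float @llvm.amdgcn.set.inactive.f32(float %active, float 0.0)
  ret float %v
}

declare float @llvm.amdgcn.set.inactive.f32(float, float)
```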
https://github.com/llvm/llvm-project/pull/94457