[llvm] [AMDGPU] Extend llvm.amdgcn.set.inactive intrinsic to support Reg32/Reg64 types (PR #94457)
Vikram Hegde via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 12 04:36:56 PDT 2024
https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/94457
>From 6b8ea1efc30e036b7cf259d115d4b8e3d1c0a066 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 5 Jun 2024 08:55:19 +0000
Subject: [PATCH 1/3] [AMDGPU] Extend llvm.amdgcn.set.inactive intrinsic to
support Reg32 and Reg64 types
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 18 +-
.../AMDGPU/llvm.amdgcn.set.inactive.ll | 230 ++++++++++++++++++
3 files changed, 243 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bfb966dfad02d..1ce6acfdd6dc1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2356,7 +2356,7 @@ def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
- Intrinsic<[llvm_anyint_ty],
+ Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, // value to be copied
LLVMMatchType<0>], // value for the inactive lanes to take
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c32..418458f0171bd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -252,16 +252,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VSrc_b32: $src, VSrc_b32:$inactive),
- [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
-}
+ (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64: $src, VSrc_b64:$inactive),
- [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
-}
+ (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC]
+foreach vt = Reg32Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
+}
+
+foreach vt = Reg64Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+ (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+}
+
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 8302af7450ed9..40d08f8692fb1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -124,6 +124,236 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
ret void
}
+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x40400000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ store float %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ store double %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x10001
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ store <2 x i16> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ store <2 x half> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ store <2 x i32> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 1.0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ store <2 x float> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ store ptr %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ store ptr addrspace(6) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
>From aef4e710251d8c5a911a6bb0d47454066e49c202 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 5 Jun 2024 10:05:37 +0000
Subject: [PATCH 2/3] add GISel tests
---
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 228 ++++++++++++++++++
1 file changed, 228 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index cbee039df7fd0..e08b7cf38ebb5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -93,6 +93,234 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
ret void
}
+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ store float %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ store double %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ store <2 x i16> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ store <2 x half> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 1
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ store <2 x i32> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 1.0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ store <2 x float> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ store ptr %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ store ptr addrspace(6) %tmp, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
>From e5a61fd3cf5cabddf153bee2f201a6671b73c103 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 12 Jun 2024 07:34:40 -0400
Subject: [PATCH 3/3] add tests for v4i16, v4f16, v4bf16, v2bf16
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +-
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 92 +++++++++++++++++++
.../AMDGPU/llvm.amdgcn.set.inactive.ll | 92 +++++++++++++++++++
3 files changed, 185 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index caac7126068ef..3666976cf82f8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,7 +586,7 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16, p0]>;
let HasVGPR = 1 in {
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index e08b7cf38ebb5..0c60be9d94591 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -225,6 +225,98 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
ret void
}
+define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ store <2 x bfloat> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
+; GCN-LABEL: set_inactive_v4i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0x10001
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ store <4 x i16> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
+; GCN-LABEL: set_inactive_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ store <4 x half> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ store <4 x bfloat> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 40d08f8692fb1..fc33206845a71 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -256,6 +256,98 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
ret void
}
+define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s5, 0x3f803f80
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ store <2 x bfloat> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
+; GCN-LABEL: set_inactive_v4i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 0x10001
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ store <4 x i16> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
+; GCN-LABEL: set_inactive_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ store <4 x half> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s8, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ store <4 x bfloat> %tmp, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
More information about the llvm-commits
mailing list