[llvm] [AMDGPU] Extend llvm.amdgcn.set.inactive intrinsic to support Reg32/Reg64 types (PR #94457)

Vikram Hegde via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 12 04:36:56 PDT 2024


https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/94457

>From 6b8ea1efc30e036b7cf259d115d4b8e3d1c0a066 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 5 Jun 2024 08:55:19 +0000
Subject: [PATCH 1/3] [AMDGPU] Extend llvm.amdgcn.set.incative intrinsic to
 support Reg32 and Reg64 types

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   2 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  18 +-
 .../AMDGPU/llvm.amdgcn.set.inactive.ll        | 230 ++++++++++++++++++
 3 files changed, 243 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bfb966dfad02d..1ce6acfdd6dc1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2356,7 +2356,7 @@ def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
 // program ever uses WQM, then the instruction and the first source will be
 // computed in WQM.
 def int_amdgcn_set_inactive :
-  Intrinsic<[llvm_anyint_ty],
+  Intrinsic<[llvm_any_ty],
             [LLVMMatchType<0>, // value to be copied
              LLVMMatchType<0>], // value for the inactive lanes to take
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c1b844f844c32..418458f0171bd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -252,16 +252,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
 // restoring it after we're done.
 let Defs = [SCC], isConvergent = 1 in {
 def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
-  (ins VSrc_b32: $src, VSrc_b32:$inactive),
-  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
-}
+  (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
 
 def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
-  (ins VSrc_b64: $src, VSrc_b64:$inactive),
-  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
-}
+  (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
 } // End Defs = [SCC]
 
+foreach vt = Reg32Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+     (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
+}
+
+foreach vt = Reg64Types.types in {
+def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
+     (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+}
+
 def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
     (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 8302af7450ed9..40d08f8692fb1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -124,6 +124,236 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
   ret void
 }
 
+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s5, 0x40400000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+  store float %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s0, 0xcccccccd
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s1, 0x4010cccc
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+  store double %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s5, 0x10001
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+  store <2 x i16> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+  store <2 x half> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s8, 1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+  store <2 x i32> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s8, 1.0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+  store <2 x float> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+  store ptr %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+  store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+  store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+  store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+  store ptr addrspace(6) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)

>From aef4e710251d8c5a911a6bb0d47454066e49c202 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 5 Jun 2024 10:05:37 +0000
Subject: [PATCH 2/3] add GIsel tests

---
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    | 228 ++++++++++++++++++
 1 file changed, 228 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index cbee039df7fd0..e08b7cf38ebb5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -93,6 +93,234 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
   ret void
 }
 
+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+  store float %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0xcccccccd
+; GCN-NEXT:    s_mov_b32 s5, 0x4010cccc
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+  store double %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+  store <2 x i16> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+  store <2 x half> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 1
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+  store <2 x i32> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 1.0
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+  store <2 x float> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+  store ptr %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+  store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+  store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+  store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+  store ptr addrspace(6) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)

>From e5a61fd3cf5cabddf153bee2f201a6671b73c103 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 12 Jun 2024 07:34:40 -0400
Subject: [PATCH 3/3] add tests for v4i16, v4f16, v4bf16, v2bf16

---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td      |  2 +-
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    | 92 +++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.set.inactive.ll        | 92 +++++++++++++++++++
 3 files changed, 185 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index caac7126068ef..3666976cf82f8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,7 +586,7 @@ class RegisterTypes<list<ValueType> reg_types> {
 
 def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
 def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16, p0]>;
 
 let HasVGPR = 1 in {
 // VOP3 and VINTERP can access 256 lo and 256 hi registers.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index e08b7cf38ebb5..0c60be9d94591 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -225,6 +225,98 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
   ret void
 }
 
+define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+  store <2 x bfloat> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
+; GCN-LABEL: set_inactive_v4i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0x10001
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+  store <4 x i16> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
+; GCN-LABEL: set_inactive_v4f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+  store <4 x half> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s4, 0x3f803f80
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+  store <4 x bfloat> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
 ; GCN-LABEL: set_inactive_p0:
 ; GCN:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 40d08f8692fb1..fc33206845a71 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -256,6 +256,98 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
   ret void
 }
 
+define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s5, 0x3f803f80
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+  store <2 x bfloat> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
+; GCN-LABEL: set_inactive_v4i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s8, 0x10001
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+  store <4 x i16> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
+; GCN-LABEL: set_inactive_v4f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s8, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+  store <4 x half> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b32 s8, 0x3f803f80
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+  store <4 x bfloat> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
 ; GCN-LABEL: set_inactive_p0:
 ; GCN:       ; %bb.0:



More information about the llvm-commits mailing list