[llvm] 5447017 - [AMDGPU] Add inreg support for SGPR arguments (#67182)

via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 8 11:35:56 PST 2023


Author: Jun Wang
Date: 2023-11-08T11:35:52-08:00
New Revision: 54470176afe20b16e6b026ab989591d1d19ad2b7

URL: https://github.com/llvm/llvm-project/commit/54470176afe20b16e6b026ab989591d1d19ad2b7
DIFF: https://github.com/llvm/llvm-project/commit/54470176afe20b16e6b026ab989591d1d19ad2b7.diff

LOG: [AMDGPU] Add inreg support for SGPR arguments (#67182)

Function parameters marked with inreg are supposed to be allocated to
SGPRs. However, for compute functions, this is ignored and function
parameters are allocated to VGPRs. This fix modifies CC_AMDGPU_Func in
AMDGPUCallingConv.td to use SGPRs if input arg is marked inreg.
---------

Co-authored-by: Jun Wang <jun.wang7 at amd.com>

Added: 
    llvm/test/CodeGen/AMDGPU/function-args-inreg.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
    llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 443bb45a2351491..9036b26a6f6bcb4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,6 +187,11 @@ def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
+  >>>,
+
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cff7e4bc66218ca..4681004d3ba74ff 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2665,6 +2665,11 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   if (!IsKernel) {
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+    if (!IsGraphics && !Subtarget->enableFlatScratch()) {
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+                                                  AMDGPU::SGPR2, AMDGPU::SGPR3},
+                              4);
+    }
     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 7b8a37532c9fad0..d0c84f7bf2574ca 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2529,8 +2529,8 @@ bool isArgPassedInSGPR(const Argument *A) {
     return A->hasAttribute(Attribute::InReg) ||
            A->hasAttribute(Attribute::ByVal);
   default:
-    // TODO: Should calls support inreg for SGPR inputs?
-    return false;
+    // TODO: treat i1 as divergent?
+    return A->hasAttribute(Attribute::InReg);
   }
 }
 
@@ -2556,8 +2556,7 @@ bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
     return CB->paramHasAttr(ArgNo, Attribute::InReg) ||
            CB->paramHasAttr(ArgNo, Attribute::ByVal);
   default:
-    // TODO: Should calls support inreg for SGPR inputs?
-    return false;
+    return CB->paramHasAttr(ArgNo, Attribute::InReg);
   }
 }
 

diff  --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 48528c6112b00ef..d7d8d29fbfd42e5 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -39,6 +39,15 @@ define i32 @asm_sgpr(i32 %divergent) {
   ret i32 %sgpr
 }
 
+; SGPR asm outputs are uniform regardless of the input operands.
+; Argument not divergent if marked inreg.
+; CHECK-LABEL: for function 'asm_sgpr_inreg_arg':
+; CHECK-NOT: DIVERGENT
+define i32 @asm_sgpr_inreg_arg(i32 inreg %divergent) {
+  %sgpr = call i32 asm "; def $0, $1","=s,v"(i32 %divergent)
+  ret i32 %sgpr
+}
+
 ; CHECK-LABEL: for function 'asm_mixed_sgpr_vgpr':
 ; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1, $2", "=s,=v,v"(i32 %divergent)
 ; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
@@ -58,6 +67,18 @@ define void @single_lane_func_arguments(i32 %i32, i1 %i1) #2 {
  ret void
 }
 
+; CHECK-LABEL: for function 'divergent_args':
+; CHECK: DIVERGENT ARGUMENTS
+define void @divergent_args(i32 %i32, i1 %i1) {
+ ret void
+}
+
+; CHECK-LABEL: for function 'no_divergent_args_if_inreg':
+; CHECK-NOT: DIVERGENT
+define void @no_divergent_args_if_inreg(i32 inreg %i32, i1 inreg %i1) {
+ ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
 declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #1

diff  --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
index 4b014f969257bdb..f6fe654896a2de9 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
@@ -30,8 +30,6 @@ define amdgpu_kernel void @test_amdgpu_kernel(ptr addrspace(4) byref([4 x <16 x
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
-; CHECK: DIVERGENT:
-; CHECK: DIVERGENT:
 define void @test_c(ptr addrspace(5) byval([4 x <16 x i8>]) %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
new file mode 100644
index 000000000000000..84287b5e4458dee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -0,0 +1,1808 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i8_inreg(i8 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i8_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i8_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i8 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i16_inreg(i16 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i16 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i32_inreg(i32 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i64_inreg(i64 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i64 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_f16_inreg(half inreg %arg0) #0 {
+; GFX9-LABEL: void_func_f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store half %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_f32_inreg(float inreg %arg0) #0 {
+; GFX9-LABEL: void_func_f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store float %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_f64_inreg(double inreg %arg0) #0 {
+; GFX9-LABEL: void_func_f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store double %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x i16> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v3i16_inreg(<3 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x i16> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4i16_inreg(<4 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x i16> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v5i16_inreg(<5 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v5i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <5 x i16> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8i16_inreg(<8 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x i16> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2i32_inreg(<2 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v3i32_inreg(<3 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    global_store_dwordx3 v[0:1], v[0:2], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4i32_inreg(<4 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v5i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <5 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX11-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v32i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s28
+; GFX9-NEXT:    v_mov_b32_e32 v5, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
+; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
+; GFX11-NEXT:    v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27
+; GFX11-NEXT:    v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-NEXT:    v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; GFX11-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT:    v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; GFX11-NEXT:    v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; GFX11-NEXT:    v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; GFX11-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <32 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2i64_inreg(<2 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <5 x i64> passed inreg (ten dwords). The checks below expect the whole
+; vector to be read from SGPRs - s4 through s13 on GFX9, s0 through s9 on
+; GFX11 - and copied into VGPRs before being stored.
+define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v5i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <5 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <8 x i64> passed inreg (sixteen dwords). The checks expect every dword in
+; an SGPR - s4 through s19 on GFX9, s0 through s15 on GFX11 - followed by
+; four 128-bit stores fed from VGPR copies.
+define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX11-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <16 x i64> passed inreg (32 dwords) - more than the SGPRs the checks show
+; being used. Expected split per the checks below:
+;   - GFX9 reads s4 through s29 and takes the trailing six dwords from v0-v5;
+;   - GFX11 reads s0 through s29 and takes the trailing two dwords from v0-v1.
+; The incoming VGPR values are moved out of the way (or stored first) before
+; v0-v3 are reused as staging registers for the SGPR copies.
+define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s28
+; GFX9-NEXT:    v_mov_b32_e32 v5, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
+; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
+; GFX11-NEXT:    v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27
+; GFX11-NEXT:    v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-NEXT:    v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; GFX11-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT:    v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; GFX11-NEXT:    v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; GFX11-NEXT:    v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; GFX11-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <2 x half> passed inreg. The checks expect the packed pair in a single
+; SGPR (s4 on GFX9, s0 on GFX11), stored with one 32-bit store.
+define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <3 x half> passed inreg. The checks expect two SGPRs (s4-s5 on GFX9,
+; s0-s1 on GFX11): the first is written with a 32-bit store and the second
+; with a 16-bit store.
+define void @void_func_v3f16_inreg(<3 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <4 x half> passed inreg. The checks expect two SGPRs (s4-s5 on GFX9,
+; s0-s1 on GFX11) copied to v0-v1 and written with one 64-bit store.
+define void @void_func_v4f16_inreg(<4 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <8 x half> passed inreg. The checks expect four SGPRs (s4-s7 on GFX9,
+; s0-s3 on GFX11) copied to v0-v3 and written with one 128-bit store.
+define void @void_func_v8f16_inreg(<8 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <16 x half> passed inreg. The checks expect eight SGPRs (s4-s11 on GFX9,
+; s0-s7 on GFX11) written out with two 128-bit stores.
+define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <2 x float> passed inreg. The checks expect two SGPRs (s4-s5 on GFX9,
+; s0-s1 on GFX11) copied to v0-v1 and written with one 64-bit store.
+define void @void_func_v2f32_inreg(<2 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <3 x float> passed inreg. The checks expect three SGPRs (s4-s6 on GFX9,
+; s0-s2 on GFX11) copied to v0-v2 and written with a single 96-bit store.
+define void @void_func_v3f32_inreg(<3 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    global_store_dwordx3 v[0:1], v[0:2], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <4 x float> passed inreg. The checks expect four SGPRs (s4-s7 on GFX9,
+; s0-s3 on GFX11) copied to v0-v3 and written with one 128-bit store.
+define void @void_func_v4f32_inreg(<4 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <8 x float> passed inreg. The checks expect eight SGPRs (s4-s11 on GFX9,
+; s0-s7 on GFX11) written out with two 128-bit stores.
+define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <16 x float> passed inreg. The checks expect sixteen SGPRs (s4-s19 on
+; GFX9, s0-s15 on GFX11) written out with four 128-bit stores; no incoming
+; VGPR argument is read.
+define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX11-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <2 x double> passed inreg (four dwords). The checks expect four SGPRs
+; (s4-s7 on GFX9, s0-s3 on GFX11) written with one 128-bit store.
+define void @void_func_v2f64_inreg(<2 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <3 x double> passed inreg (six dwords). The checks expect six SGPRs
+; (s4-s9 on GFX9, s0-s5 on GFX11), written as one 64-bit store for the last
+; two dwords and one 128-bit store for the first four.
+define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <4 x double> passed inreg (eight dwords). The checks expect eight SGPRs
+; (s4-s11 on GFX9, s0-s7 on GFX11) written out with two 128-bit stores.
+define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <8 x double> passed inreg (sixteen dwords). The checks expect sixteen
+; SGPRs (s4-s19 on GFX9, s0-s15 on GFX11) written out with four 128-bit
+; stores.
+define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX11-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <16 x double> passed inreg (32 dwords) - more than the SGPRs the checks
+; show being used. Expected split per the checks below:
+;   - GFX9 reads s4 through s29 and takes the trailing six dwords from v0-v5;
+;   - GFX11 reads s0 through s29 and takes the trailing two dwords from v0-v1.
+; The incoming VGPR values are moved out of the way (or stored first) before
+; v0-v3 are reused as staging registers for the SGPR copies.
+define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s28
+; GFX9-NEXT:    v_mov_b32_e32 v5, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
+; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
+; GFX11-NEXT:    v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27
+; GFX11-NEXT:    v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-NEXT:    v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; GFX11-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT:    v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; GFX11-NEXT:    v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; GFX11-NEXT:    v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; GFX11-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+; <32 x i32> inreg followed by four small inreg scalars (i1, i8, i16, half).
+; Per the checks below, the vector fills the SGPRs that are used (s4-s29 on
+; GFX9, s0-s29 on GFX11) and its trailing dwords arrive in VGPRs (v0-v5 on
+; GFX9, v0-v1 on GFX11). The four scalars are then read from the next VGPRs
+; (v6-v9 on GFX9, v2-v5 on GFX11) even though they are marked inreg.
+; All stores are volatile, so the checks expect a wait after each one.
+; NOTE(review): the function name says "f32" but %arg4 is declared as half -
+; confirm whether the name or the type is the intended one.
+define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inreg %arg1, i8 inreg %arg2, i16 inreg %arg3, half inreg %arg4) #0 {
+; GFX9-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s28
+; GFX9-NEXT:    v_mov_b32_e32 v11, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[0:1], v7, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[0:1], v8, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[0:1], v9, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29
+; GFX11-NEXT:    v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-NEXT:    v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-NEXT:    v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21
+; GFX11-NEXT:    v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23
+; GFX11-NEXT:    v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-NEXT:    v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13
+; GFX11-NEXT:    v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s15
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX11-NEXT:    v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9
+; GFX11-NEXT:    v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11
+; GFX11-NEXT:    v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v15, s5
+; GFX11-NEXT:    v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7
+; GFX11-NEXT:    v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1
+; GFX11-NEXT:    v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b16 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b16 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i8 %arg2, ptr addrspace(1) undef
+  store volatile i16 %arg3, ptr addrspace(1) undef
+  store volatile half %arg4, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32> inreg %arg1, <2 x float> inreg %arg2) #0 {
+; GFX9-LABEL: void_func_v32i32_v2i32_v2f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s28
+; GFX9-NEXT:    v_mov_b32_e32 v11, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v2i32_v2f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29
+; GFX11-NEXT:    v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-NEXT:    v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-NEXT:    v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21
+; GFX11-NEXT:    v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v7, s17
+; GFX11-NEXT:    v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19
+; GFX11-NEXT:    v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
+; GFX11-NEXT:    v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s15
+; GFX11-NEXT:    v_dual_mov_b32 v14, s8 :: v_dual_mov_b32 v15, s9
+; GFX11-NEXT:    v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v17, s11
+; GFX11-NEXT:    v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5
+; GFX11-NEXT:    v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7
+; GFX11-NEXT:    v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-NEXT:    v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[22:25], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <2 x i32> %arg1, ptr addrspace(1) undef
+  store volatile <2 x float> %arg2, ptr addrspace(1) undef
+  ret void
+}
+
+define void @too_many_args_use_workitem_id_x_inreg(
+  i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7,
+  i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15,
+  i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23,
+  i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31) {
+; GFX9-LABEL: {{^}}too_many_args_use_workitem_id_x_inreg:
+; GFX9:        ; %bb.0:
+; GFX9-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s4
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s5
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s6
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s7
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s8
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s9
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s10
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s11
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s12
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s13
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s14
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s15
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s16
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s17
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s18
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s19
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s20
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s21
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s22
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s23
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s24
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s25
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s26
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s27
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s28
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s29
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v0, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v1, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v2, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v3, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v4, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v5, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: {{^}}too_many_args_use_workitem_id_x_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v2, s5
+; GFX11-NEXT:    v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v4, s7
+; GFX11-NEXT:    v_mov_b32_e32 v6, s9
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v2, s10
+; GFX11-NEXT:    v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12
+; GFX11-NEXT:    v_mov_b32_e32 v6, s14
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, s16
+; GFX11-NEXT:    v_dual_mov_b32 v4, s17 :: v_dual_mov_b32 v5, s18
+; GFX11-NEXT:    v_mov_b32_e32 v6, s19
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21
+; GFX11-NEXT:    v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23
+; GFX11-NEXT:    v_mov_b32_e32 v6, s24
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v5, s28 :: v_dual_mov_b32 v2, s25
+; GFX11-NEXT:    v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v4, s27
+; GFX11-NEXT:    v_mov_b32_e32 v6, s29
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  ;%val = call i32 @llvm.amdgcn.workitem.id.x()
+  ;store volatile i32 %val, ptr addrspace(1) undef
+
+  store volatile i32 %arg0, ptr addrspace(1) undef
+  store volatile i32 %arg1, ptr addrspace(1) undef
+  store volatile i32 %arg2, ptr addrspace(1) undef
+  store volatile i32 %arg3, ptr addrspace(1) undef
+  store volatile i32 %arg4, ptr addrspace(1) undef
+  store volatile i32 %arg5, ptr addrspace(1) undef
+  store volatile i32 %arg6, ptr addrspace(1) undef
+  store volatile i32 %arg7, ptr addrspace(1) undef
+
+  store volatile i32 %arg8, ptr addrspace(1) undef
+  store volatile i32 %arg9, ptr addrspace(1) undef
+  store volatile i32 %arg10, ptr addrspace(1) undef
+  store volatile i32 %arg11, ptr addrspace(1) undef
+  store volatile i32 %arg12, ptr addrspace(1) undef
+  store volatile i32 %arg13, ptr addrspace(1) undef
+  store volatile i32 %arg14, ptr addrspace(1) undef
+  store volatile i32 %arg15, ptr addrspace(1) undef
+
+  store volatile i32 %arg16, ptr addrspace(1) undef
+  store volatile i32 %arg17, ptr addrspace(1) undef
+  store volatile i32 %arg18, ptr addrspace(1) undef
+  store volatile i32 %arg19, ptr addrspace(1) undef
+  store volatile i32 %arg20, ptr addrspace(1) undef
+  store volatile i32 %arg21, ptr addrspace(1) undef
+  store volatile i32 %arg22, ptr addrspace(1) undef
+  store volatile i32 %arg23, ptr addrspace(1) undef
+
+  store volatile i32 %arg24, ptr addrspace(1) undef
+  store volatile i32 %arg25, ptr addrspace(1) undef
+  store volatile i32 %arg26, ptr addrspace(1) undef
+  store volatile i32 %arg27, ptr addrspace(1) undef
+  store volatile i32 %arg28, ptr addrspace(1) undef
+  store volatile i32 %arg29, ptr addrspace(1) undef
+  store volatile i32 %arg30, ptr addrspace(1) undef
+  store volatile i32 %arg31, ptr addrspace(1) undef
+
+  ret void
+}
+
+define void @void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg1) #0 {
+; GFX9-LABEL: void_func_i32_v2float_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:  v_mov_b32_e32 v0, s4
+; GFX9-NEXT:  global_store_dword v[0:1], v0, off
+; GFX9-NEXT:  v_mov_b32_e32 v0, s5
+; GFX9-NEXT:  v_mov_b32_e32 v1, s6
+; GFX9-NEXT:  global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:  s_waitcnt vmcnt(0)
+; GFX9-NEXT:  s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i32_v2float_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:  v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:  v_mov_b32_e32 v0, s1
+; GFX11-NEXT:  s_clause 0x1
+; GFX11-NEXT:  global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:  global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:  s_setpc_b64 s[30:31]
+  store i32 %arg0, ptr addrspace(1) undef
+  store <2 x float> %arg1, ptr addrspace(1) undef
+  ret void
+}
+
+define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg1) #0 {
+; GFX9-LABEL: caller_void_func_i32_v2float_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:  s_mov_b32 s7, s33
+; GFX9-NEXT:  s_mov_b32 s33, s32
+; GFX9-NEXT:  s_or_saveexec_b64 s[8:9], -1
+; GFX9-NEXT:  buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:  s_mov_b64 exec, s[8:9]
+; GFX9-NEXT:  s_addk_i32 s32, 0x400
+; GFX9-NEXT:  s_getpc_b64 s[8:9]
+; GFX9-NEXT:  s_add_u32 s8, s8, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4
+; GFX9-NEXT:  s_addc_u32 s9, s9, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12
+; GFX9-NEXT:  s_load_dwordx2 s[8:9], s[8:9], 0x0
+; GFX9-NEXT:  v_writelane_b32 v40, s7, 2
+; GFX9-NEXT:  v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:  s_mov_b32 s2, s6
+; GFX9-NEXT:  s_mov_b32 s1, s5
+; GFX9-NEXT:  s_mov_b32 s0, s4
+; GFX9-NEXT:  v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:  s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:  s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:  v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:  v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:  v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:  s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:  buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:  s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:  s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:  s_mov_b32 s33, s4
+; GFX9-NEXT:  s_waitcnt vmcnt(0)
+; GFX9-NEXT:  s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: caller_void_func_i32_v2float_inreg:
+; GFX11:      ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s4, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33         ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s4
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[4:5]
+; GFX11-NEXT: s_add_u32 s4, s4, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s5, s5, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v40, s3, 2
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33          ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  call void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg1)
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noinline }
+

diff  --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
index 9dcd3a66a16dbf0..deaf361c823cfea 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -134,13 +134,11 @@ l2:
 ; any of the v_cmp source operands.
 
 ; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
-; GCN:  .LBB7_3: ; %then
-; GFX1010:          v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
-; GFX1010-NEXT:     v_mov_b32_e32 {{.*}}, s[[A]]
-; GFX1010-NEXT:     s_and_saveexec_b32 s[[A]], vcc_lo
-; GFX1030:          v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
-; GFX1030-NEXT:     v_mov_b32_e32 {{.*}}, s[[A]]
-; GFX1030-NEXT:     s_and_saveexec_b32 s[[A]], vcc_lo
+; GCN:  .LBB7_2: ; %then
+; GFX1010:          v_cmp_eq_u32_e64 s[[C:[0-9]+]], s[[A:[0-9]+]], s[[B:[0-9]+]]
+; GFX1010-NEXT:     s_cmp_ge_i32 s[[C]], s[[B]]
+; GFX1030:          v_cmp_eq_u32_e64 s[[C:[0-9]+]], s[[A:[0-9]+]], s[[B:[0-9]+]]
+; GFX1030-NEXT:     s_cmp_ge_i32 s[[C]], s[[B]]
 define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
 entry:
   %0 = icmp sge i32 %a, 0


        


More information about the llvm-commits mailing list