[llvm] [AMDGPU] Add inreg support for SGPR arguments (PR #67182)

Jun Wang via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 11 14:13:54 PDT 2023


https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/67182

>From 76dc90c4603cc7269648f8b439b5c288d0c88c0c Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 22 Sep 2023 14:06:13 -0500
Subject: [PATCH 1/3] AMDGPU: Add inreg support for SGPR arguments

Function parameters marked with inreg are supposed to be
allocated to SGPRs. However, for compute functions, this is ignored and
function parameters are allocated to VGPRs. This fix modifies CC_AMDGPU_Func
in AMDGPUCallingConv.td to use SGPRs if input arg is marked inreg.
Correspondingly, the function check_saveexec_overwrites_vcmp_source in
testcase vcmp-saveexec-to-vcmpx.ll is duplicate such that one has inreg
and the other does not.
---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |   5 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   3 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |   4 +-
 .../AMDGPU/always_uniform.ll                  |  21 +
 .../UniformityAnalysis/AMDGPU/kernel-args.ll  |   2 -
 .../CodeGen/AMDGPU/function-args-inreg.ll     | 359 ++++++++++++++++++
 .../CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll  |  37 +-
 7 files changed, 425 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/function-args-inreg.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 4d7090942142f3c..4b43fbecaf523ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -183,6 +183,11 @@ def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
+  >>>,
+
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9bd0f5390b19e31..9ac948712ec61dd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2646,6 +2646,9 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   if (!IsKernel) {
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+    if (!IsGraphics && !IsKernel && !Subtarget->enableFlatScratch()) {
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+    }
     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 6d0ad763d9e6cc1..166b7c64140f2f1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2544,7 +2544,7 @@ bool isArgPassedInSGPR(const Argument *A) {
            A->hasAttribute(Attribute::ByVal);
   default:
     // TODO: Should calls support inreg for SGPR inputs?
-    return false;
+    return A->hasAttribute(Attribute::InReg);
   }
 }
 
@@ -2571,7 +2571,7 @@ bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
            CB->paramHasAttr(ArgNo, Attribute::ByVal);
   default:
     // TODO: Should calls support inreg for SGPR inputs?
-    return false;
+    return CB->paramHasAttr(ArgNo, Attribute::InReg);
   }
 }
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 48528c6112b00ef..2e5a3e998ed89d4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -39,6 +39,15 @@ define i32 @asm_sgpr(i32 %divergent) {
   ret i32 %sgpr
 }
 
+; SGPR asm outputs are uniform regardless of the input operands.
+; Argument not divergent if marked inreg.
+; CHECK-LABEL: for function 'asm_sgpr_inreg_arg':
+; CHECK-NOT: DIVERGENT
+define i32 @asm_sgpr_inreg_arg(i32 inreg %divergent) {
+  %sgpr = call i32 asm "; def $0, $1","=s,v"(i32 %divergent)
+  ret i32 %sgpr
+}
+
 ; CHECK-LABEL: for function 'asm_mixed_sgpr_vgpr':
 ; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1, $2", "=s,=v,v"(i32 %divergent)
 ; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
@@ -58,6 +67,18 @@ define void @single_lane_func_arguments(i32 %i32, i1 %i1) #2 {
  ret void
 }
 
+; CHECK-LABEL: for function 'divergent_args':
+; CHECK: DIVERGENT ARGUMENTS
+define void @divergent_args(i32 %i32, i1 %i1) {
+ ret void
+}
+
+; CHECK-LABEL: for function 'no_divergent_args_if_inreg':
+; CHECK-NOT: DIVERGENT ARGUMENTS
+define void @no_divergent_args_if_inreg(i32 inreg %i32, i1 inreg %i1) {
+ ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
 declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #1
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
index 4b014f969257bdb..f6fe654896a2de9 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
@@ -30,8 +30,6 @@ define amdgpu_kernel void @test_amdgpu_kernel(ptr addrspace(4) byref([4 x <16 x
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
-; CHECK: DIVERGENT:
-; CHECK: DIVERGENT:
 define void @test_c(ptr addrspace(5) byval([4 x <16 x i8>]) %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
new file mode 100644
index 000000000000000..7750fb3316c1c82
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -0,0 +1,359 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_i1_inreg:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_and_b32 s4, s4, 1
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i8(i8 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i8:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i8 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i8_inreg(i8 inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_i8_inreg:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i8_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i8 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i16(i16 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i16:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i16 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i16_inreg(i16 inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_i16_inreg:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i16 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i32(i32 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_i32_inreg(i32 inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_i32_inreg:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_f16(half %arg0) #0 {
+; GFX89-LABEL: void_func_f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store half %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_f16_inreg(half inreg %arg0) #0 {
+; GFX89-LABEL: void_func_f16_inreg:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    v_mov_b32_e32 v0, s4
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store half %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_f32(float %arg0) #0 {
+; CIGFX89-LABEL: void_func_f32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store float %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_f32_inreg(float inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_f32_inreg:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store float %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2i16(<2 x i16> %arg0) #0 {
+; GFX89-LABEL: void_func_v2i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x i16> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 {
+; GFX89-LABEL: void_func_v2i16_inreg:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    v_mov_b32_e32 v0, s4
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x i16> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2f16(<2 x half> %arg0) #0 {
+; GFX89-LABEL: void_func_v2f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 {
+; GFX89-LABEL: void_func_v2f16_inreg:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    v_mov_b32_e32 v0, s4
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
index 9dcd3a66a16dbf0..14d13bbee28e0ff 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -133,15 +133,48 @@ l2:
 ; Omit the transformation if the s_and_saveexec instruction overwrites
 ; any of the v_cmp source operands.
 
+
 ; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
-; GCN:  .LBB7_3: ; %then
+; GCN:  .LBB7_2: ; %then
+; GFX1010:          v_cmp_eq_u32_e64 s[[C:[0-9]+]], s[[A:[0-9]+]], s[[B:[0-9]+]]
+; GFX1010-NEXT:     s_cmp_ge_i32 s[[C]], s[[B]]
+; GFX1030:          v_cmp_eq_u32_e64 s[[C:[0-9]+]], s[[A:[0-9]+]], s[[B:[0-9]+]]
+; GFX1030-NEXT:     s_cmp_ge_i32 s[[C]], s[[B]]
+define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
+entry:
+  %0 = icmp sge i32 %a, 0
+  br i1 %0, label %if, label %then
+
+if:
+  %1 = shl i32 %a, 2
+  %2 = or i32 %1, %b
+  ret i32 %2
+
+then:
+  %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+  %4 = trunc i64 %3 to i32
+  %5 = icmp slt i32 %4, %b
+  br i1 %5, label %after, label %end
+
+after:
+  ret i32 %4
+
+end:
+  ret i32 %a
+}
+
+; Omit the transformation if the s_and_saveexec instruction overwrites
+; any of the v_cmp source operands.
+
+; GCN-LABEL: check_saveexec_overwrites_vcmp_source_no_inreg:
+; GCN:  .LBB8_3: ; %then
 ; GFX1010:          v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
 ; GFX1010-NEXT:     v_mov_b32_e32 {{.*}}, s[[A]]
 ; GFX1010-NEXT:     s_and_saveexec_b32 s[[A]], vcc_lo
 ; GFX1030:          v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
 ; GFX1030-NEXT:     v_mov_b32_e32 {{.*}}, s[[A]]
 ; GFX1030-NEXT:     s_and_saveexec_b32 s[[A]], vcc_lo
-define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
+define i32 @check_saveexec_overwrites_vcmp_source_no_inreg(i32 %a, i32 %b) {
 entry:
   %0 = icmp sge i32 %a, 0
   br i1 %0, label %if, label %then

>From 66387b8afa977e580c43f6055a22a900bb5f0303 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 22 Sep 2023 15:44:20 -0500
Subject: [PATCH 2/3] [AMDGPU] Add inreg support for SGPR arguments

Fix code format.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9ac948712ec61dd..5775877e807126e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2647,7 +2647,9 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (!IsKernel) {
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
     if (!IsGraphics && !IsKernel && !Subtarget->enableFlatScratch()) {
-      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+                                                  AMDGPU::SGPR2, AMDGPU::SGPR3},
+                              4);
     }
     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
   }

>From 7e04f8d2eff963a31e5ad486623227b0a38cd1f3 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Wed, 11 Oct 2023 15:44:29 -0500
Subject: [PATCH 3/3] Removed Todo comments; added testcases.

---
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |    2 -
 .../AMDGPU/always_uniform.ll                  |    2 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll     | 1809 +++++++++++++++--
 3 files changed, 1584 insertions(+), 229 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 166b7c64140f2f1..3779bcafff1aefd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2543,7 +2543,6 @@ bool isArgPassedInSGPR(const Argument *A) {
     return A->hasAttribute(Attribute::InReg) ||
            A->hasAttribute(Attribute::ByVal);
   default:
-    // TODO: Should calls support inreg for SGPR inputs?
     return A->hasAttribute(Attribute::InReg);
   }
 }
@@ -2570,7 +2569,6 @@ bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
     return CB->paramHasAttr(ArgNo, Attribute::InReg) ||
            CB->paramHasAttr(ArgNo, Attribute::ByVal);
   default:
-    // TODO: Should calls support inreg for SGPR inputs?
     return CB->paramHasAttr(ArgNo, Attribute::InReg);
   }
 }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 2e5a3e998ed89d4..d7d8d29fbfd42e5 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -74,7 +74,7 @@ define void @divergent_args(i32 %i32, i1 %i1) {
 }
 
 ; CHECK-LABEL: for function 'no_divergent_args_if_inreg':
-; CHECK-NOT: DIVERGENT ARGUMENTS
+; CHECK-NOT: DIVERGENT
 define void @no_divergent_args_if_inreg(i32 inreg %i32, i1 inreg %i1) {
  ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 7750fb3316c1c82..27b0d2fff9e1f91 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1,359 +1,1716 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
-define void @void_func_i1(i1 %arg0) #0 {
-; CIGFX89-LABEL: void_func_i1:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i1_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i1:
+; GFX11-LABEL: void_func_i1_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i1 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
-; CIGFX89-LABEL: void_func_i1_inreg:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_and_b32 s4, s4, 1
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
-; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_i8_inreg(i8 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i8_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i1_inreg:
+; GFX11-LABEL: void_func_i8_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s0, s0, 1
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i1 %arg0, ptr addrspace(1) undef
+  store i8 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_i8(i8 %arg0) #0 {
-; CIGFX89-LABEL: void_func_i8:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_i16_inreg(i16 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i8:
+; GFX11-LABEL: void_func_i16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i8 %arg0, ptr addrspace(1) undef
+  store i16 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_i8_inreg(i8 inreg %arg0) #0 {
-; CIGFX89-LABEL: void_func_i8_inreg:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
-; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_i32_inreg(i32 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i8_inreg:
+; GFX11-LABEL: void_func_i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i8 %arg0, ptr addrspace(1) undef
+  store i32 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_i16(i16 %arg0) #0 {
-; CIGFX89-LABEL: void_func_i16:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_i64_inreg(i64 inreg %arg0) #0 {
+; GFX9-LABEL: void_func_i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v0, s4
+; GFX9-NEXT:       v_mov_b32_e32 v1, s5
+; GFX9-NEXT:       global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       s_setpc_b64 s[30:31]
+;
+; GFX11:      ; %bb.0:
+; GFX11-NEXT:      s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:      v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:      global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:      s_setpc_b64 s[30:31]
+  store i64 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_f16_inreg(half inreg %arg0) #0 {
+; GFX9-LABEL: void_func_f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i16:
+; GFX11-LABEL: void_func_f16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i16 %arg0, ptr addrspace(1) undef
+  store half %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_i16_inreg(i16 inreg %arg0) #0 {
-; CIGFX89-LABEL: void_func_i16_inreg:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
-; CIGFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_f32_inreg(float inreg %arg0) #0 {
+; GFX9-LABEL: void_func_f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i16_inreg:
+; GFX11-LABEL: void_func_f32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i16 %arg0, ptr addrspace(1) undef
+  store float %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_i32(i32 %arg0) #0 {
-; CIGFX89-LABEL: void_func_i32:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_f64_inreg(double inreg %arg0) #0 {
+; GFX9-LABEL: void_func_f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i32:
+; GFX11-LABEL: void_func_f64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i32 %arg0, ptr addrspace(1) undef
+  store double %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_i32_inreg(i32 inreg %arg0) #0 {
-; CIGFX89-LABEL: void_func_i32_inreg:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
-; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i32_inreg:
+; GFX11-LABEL: void_func_v2i16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i32 %arg0, ptr addrspace(1) undef
+  store <2 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_f16(half %arg0) #0 {
-; GFX89-LABEL: void_func_f16:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_v3i16_inreg(<3 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_f16:
+; GFX11-LABEL: void_func_v3i16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store half %arg0, ptr addrspace(1) undef
+  store <3 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_f16_inreg(half inreg %arg0) #0 {
-; GFX89-LABEL: void_func_f16_inreg:
+define void @void_func_v4i16_inreg(<4 x i16> inreg %arg0) #0 {
+; GFX89-LABEL: void_func_v4i16_inreg:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    v_mov_b32_e32 v0, s4
-; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    v_mov_b32_e32 v1, s5
+; GFX89-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_f16_inreg:
+; GFX11-LABEL: void_func_v4i16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x i16> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v5i16_inreg(<5 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v5i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store half %arg0, ptr addrspace(1) undef
+  store <5 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_f32(float %arg0) #0 {
-; CIGFX89-LABEL: void_func_f32:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_v8i16_inreg(<8 x i16> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_f32:
+; GFX11-LABEL: void_func_v8i16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store float %arg0, ptr addrspace(1) undef
+  store <8 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_f32_inreg(float inreg %arg0) #0 {
-; CIGFX89-LABEL: void_func_f32_inreg:
+define void @void_func_v2i32_inreg(<2 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v3i32_inreg(<3 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    global_store_dwordx3 v[0:1], v[0:2], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4i32_inreg(<4 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v5i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <5 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x i32> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_v16i32_inreg:
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, s16
+; CIGFX89-NEXT:    v_mov_b32_e32 v1, s17
+; CIGFX89-NEXT:    v_mov_b32_e32 v2, s18
+; CIGFX89-NEXT:    v_mov_b32_e32 v3, s19
+; CIGFX89-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; CIGFX89-NEXT:    s_nop 0
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, s12
+; CIGFX89-NEXT:    v_mov_b32_e32 v1, s13
+; CIGFX89-NEXT:    v_mov_b32_e32 v2, s14
+; CIGFX89-NEXT:    v_mov_b32_e32 v3, s15
+; CIGFX89-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; CIGFX89-NEXT:    s_nop 0
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, s8
+; CIGFX89-NEXT:    v_mov_b32_e32 v1, s9
+; CIGFX89-NEXT:    v_mov_b32_e32 v2, s10
+; CIGFX89-NEXT:    v_mov_b32_e32 v3, s11
+; CIGFX89-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; CIGFX89-NEXT:    s_nop 0
 ; CIGFX89-NEXT:    v_mov_b32_e32 v0, s4
-; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    v_mov_b32_e32 v1, s5
+; CIGFX89-NEXT:    v_mov_b32_e32 v2, s6
+; CIGFX89-NEXT:    v_mov_b32_e32 v3, s7
+; CIGFX89-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_f32_inreg:
+; GFX11-LABEL: void_func_v16i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX11-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store float %arg0, ptr addrspace(1) undef
+  store <16 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_v2i16(<2 x i16> %arg0) #0 {
-; GFX89-LABEL: void_func_v2i16:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v32i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s28
+; GFX9-NEXT:    v_mov_b32_e32 v5, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v2i16:
+; GFX11-LABEL: void_func_v32i32_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
+; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
+; GFX11-NEXT:    v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27
+; GFX11-NEXT:    v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-NEXT:    v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; GFX11-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT:    v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; GFX11-NEXT:    v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; GFX11-NEXT:    v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; GFX11-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <2 x i16> %arg0, ptr addrspace(1) undef
+  store <32 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 {
-; GFX89-LABEL: void_func_v2i16_inreg:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    v_mov_b32_e32 v0, s4
-; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_v2i64_inreg(<2 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v2i16_inreg:
+; GFX11-LABEL: void_func_v2i64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <2 x i16> %arg0, ptr addrspace(1) undef
+  store <2 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-define void @void_func_v2f16(<2 x half> %arg0) #0 {
-; GFX89-LABEL: void_func_v2f16:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_setpc_b64 s[30:31]
+define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v2f16:
+; GFX11-LABEL: void_func_v3i64_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <2 x half> %arg0, ptr addrspace(1) undef
+  store <3 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v5i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <5 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX11-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x i64> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s28
+; GFX9-NEXT:    v_mov_b32_e32 v5, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16i64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
+; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
+; GFX11-NEXT:    v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27
+; GFX11-NEXT:    v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-NEXT:    v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; GFX11-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT:    v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; GFX11-NEXT:    v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; GFX11-NEXT:    v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; GFX11-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 {
-; GFX89-LABEL: void_func_v2f16_inreg:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    v_mov_b32_e32 v0, s4
-; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: void_func_v2f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2f16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
+define void @void_func_v3f16_inreg(<3 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4f16_inreg(<4 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8f16_inreg(<8 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x half> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2f32_inreg(<2 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v3f32_inreg(<3 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    global_store_dwordx3 v[0:1], v[0:2], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4f32_inreg(<4 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX11-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x float> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2f64_inreg(<2 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
+; GFX11-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11
+; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
+; GFX11-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16f64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s28
+; GFX9-NEXT:    v_mov_b32_e32 v5, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f64_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
+; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
+; GFX11-NEXT:    v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27
+; GFX11-NEXT:    v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-NEXT:    v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; GFX11-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT:    v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; GFX11-NEXT:    v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; GFX11-NEXT:    v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; GFX11-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x double> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inreg %arg1, i8 inreg %arg2, i16 inreg %arg3, half inreg %arg4) #0 {
+; GFX9-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s28
+; GFX9-NEXT:    v_mov_b32_e32 v11, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[0:1], v7, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[0:1], v8, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[0:1], v9, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29
+; GFX11-NEXT:    v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-NEXT:    v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-NEXT:    v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21
+; GFX11-NEXT:    v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23
+; GFX11-NEXT:    v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-NEXT:    v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13
+; GFX11-NEXT:    v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s15
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX11-NEXT:    v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9
+; GFX11-NEXT:    v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11
+; GFX11-NEXT:    v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v15, s5
+; GFX11-NEXT:    v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7
+; GFX11-NEXT:    v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1
+; GFX11-NEXT:    v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b8 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b16 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b16 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i8 %arg2, ptr addrspace(1) undef
+  store volatile i16 %arg3, ptr addrspace(1) undef
+  store volatile half %arg4, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32> inreg %arg1, <2 x float> inreg %arg2) #0 {
+; GFX9-LABEL: void_func_v32i32_v2i32_v2f32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, s28
+; GFX9-NEXT:    v_mov_b32_e32 v11, s29
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s24
+; GFX9-NEXT:    v_mov_b32_e32 v1, s25
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    v_mov_b32_e32 v3, s27
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s20
+; GFX9-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v2i32_v2f32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29
+; GFX11-NEXT:    v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-NEXT:    v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-NEXT:    v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21
+; GFX11-NEXT:    v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v7, s17
+; GFX11-NEXT:    v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19
+; GFX11-NEXT:    v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
+; GFX11-NEXT:    v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s15
+; GFX11-NEXT:    v_dual_mov_b32 v14, s8 :: v_dual_mov_b32 v15, s9
+; GFX11-NEXT:    v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v17, s11
+; GFX11-NEXT:    v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5
+; GFX11-NEXT:    v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7
+; GFX11-NEXT:    v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-NEXT:    v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b128 v[0:1], v[22:25], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <2 x i32> %arg1, ptr addrspace(1) undef
+  store volatile <2 x float> %arg2, ptr addrspace(1) undef
+  ret void
+}
+
+define void @too_many_args_use_workitem_id_x_inreg(
+  i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7,
+  i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15,
+  i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23,
+  i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31) {
+; GFX9-LABEL: {{^}}too_many_args_use_workitem_id_x_inreg:
+; GFX9:        ; %bb.0:
+; GFX9-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s4
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s5
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s6
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s7
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s8
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s9
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s10
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s11
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s12
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s13
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s14
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s15
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s16
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s17
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s18
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s19
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s20
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s21
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s22
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s23
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s24
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s25
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s26
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s27
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s28
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       v_mov_b32_e32 v6, s29
+; GFX9-NEXT:       global_store_dword v[0:1], v6, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v0, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v1, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v2, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v3, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v4, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       global_store_dword v[0:1], v5, off
+; GFX9-NEXT:       s_waitcnt vmcnt(0)
+; GFX9-NEXT:       s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: {{^}}too_many_args_use_workitem_id_x_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v2, s5
+; GFX11-NEXT:    v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v4, s7
+; GFX11-NEXT:    v_mov_b32_e32 v6, s9
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v2, s10
+; GFX11-NEXT:    v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12
+; GFX11-NEXT:    v_mov_b32_e32 v6, s14
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, s16
+; GFX11-NEXT:    v_dual_mov_b32 v4, s17 :: v_dual_mov_b32 v5, s18
+; GFX11-NEXT:    v_mov_b32_e32 v6, s19
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21
+; GFX11-NEXT:    v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23
+; GFX11-NEXT:    v_mov_b32_e32 v6, s24
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v5, s28 :: v_dual_mov_b32 v2, s25
+; GFX11-NEXT:    v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v4, s27
+; GFX11-NEXT:    v_mov_b32_e32 v6, s29
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  ;%val = call i32 @llvm.amdgcn.workitem.id.x()
+  ;store volatile i32 %val, ptr addrspace(1) undef
+
+  store volatile i32 %arg0, ptr addrspace(1) undef
+  store volatile i32 %arg1, ptr addrspace(1) undef
+  store volatile i32 %arg2, ptr addrspace(1) undef
+  store volatile i32 %arg3, ptr addrspace(1) undef
+  store volatile i32 %arg4, ptr addrspace(1) undef
+  store volatile i32 %arg5, ptr addrspace(1) undef
+  store volatile i32 %arg6, ptr addrspace(1) undef
+  store volatile i32 %arg7, ptr addrspace(1) undef
+
+  store volatile i32 %arg8, ptr addrspace(1) undef
+  store volatile i32 %arg9, ptr addrspace(1) undef
+  store volatile i32 %arg10, ptr addrspace(1) undef
+  store volatile i32 %arg11, ptr addrspace(1) undef
+  store volatile i32 %arg12, ptr addrspace(1) undef
+  store volatile i32 %arg13, ptr addrspace(1) undef
+  store volatile i32 %arg14, ptr addrspace(1) undef
+  store volatile i32 %arg15, ptr addrspace(1) undef
+
+  store volatile i32 %arg16, ptr addrspace(1) undef
+  store volatile i32 %arg17, ptr addrspace(1) undef
+  store volatile i32 %arg18, ptr addrspace(1) undef
+  store volatile i32 %arg19, ptr addrspace(1) undef
+  store volatile i32 %arg20, ptr addrspace(1) undef
+  store volatile i32 %arg21, ptr addrspace(1) undef
+  store volatile i32 %arg22, ptr addrspace(1) undef
+  store volatile i32 %arg23, ptr addrspace(1) undef
+
+  store volatile i32 %arg24, ptr addrspace(1) undef
+  store volatile i32 %arg25, ptr addrspace(1) undef
+  store volatile i32 %arg26, ptr addrspace(1) undef
+  store volatile i32 %arg27, ptr addrspace(1) undef
+  store volatile i32 %arg28, ptr addrspace(1) undef
+  store volatile i32 %arg29, ptr addrspace(1) undef
+  store volatile i32 %arg30, ptr addrspace(1) undef
+  store volatile i32 %arg31, ptr addrspace(1) undef
+
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noinline }
+



More information about the llvm-commits mailing list