[llvm-branch-commits] [llvm] AMDGPU: Use default shouldRewriteCopySrc (PR #125535)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Feb 3 22:04:06 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/125535
>From e7b88d2c349059c01ddf463bf014a0c66d7c3b7e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 23 Jan 2025 14:39:10 +0700
Subject: [PATCH] AMDGPU: Use default shouldRewriteCopySrc
This was ultimately working around bugs in subregister handling
in peephole-opt. In the common case, it would give up on folding
anything into a subregister extract copy.
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 24 -
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5 -
llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll | 44 +-
llvm/test/CodeGen/AMDGPU/ctpop64.ll | 36 +-
llvm/test/CodeGen/AMDGPU/idot2.ll | 182 +++----
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 85 ++--
.../peephole-opt-fold-reg-sequence-subreg.mir | 8 +-
.../AMDGPU/peephole-opt-regseq-removal.mir | 4 +-
.../CodeGen/AMDGPU/spill-scavenge-offset.ll | 476 +++++++++---------
9 files changed, 418 insertions(+), 446 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 6fc57dec6a8264..71c720ed09b5fb 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3516,30 +3516,6 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
OpType <= AMDGPU::OPERAND_SRC_LAST;
}
-bool SIRegisterInfo::shouldRewriteCopySrc(
- const TargetRegisterClass *DefRC,
- unsigned DefSubReg,
- const TargetRegisterClass *SrcRC,
- unsigned SrcSubReg) const {
- // We want to prefer the smallest register class possible, so we don't want to
- // stop and rewrite on anything that looks like a subregister
- // extract. Operations mostly don't care about the super register class, so we
- // only want to stop on the most basic of copies between the same register
- // class.
- //
- // e.g. if we have something like
- // %0 = ...
- // %1 = ...
- // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
- // %3 = COPY %2, sub0
- //
- // We want to look through the COPY to find:
- // => %3 = COPY %0
-
- // Plain copy.
- return getCommonSubClass(DefRC, SrcRC) != nullptr;
-}
-
bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
// TODO: 64-bit operands have extending behavior from 32-bit literal.
return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 8e481e3ac23043..a434efb70d0525 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -275,11 +275,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
const TargetRegisterClass *SubRC,
unsigned SubIdx) const;
- bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
- unsigned DefSubReg,
- const TargetRegisterClass *SrcRC,
- unsigned SrcSubReg) const override;
-
/// \returns True if operands defined with this operand type can accept
/// a literal constant (i.e. any 32-bit immediate).
bool opCanUseLiteralConstant(unsigned OpType) const;
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index c6c0b9cf8f027f..cc2f775ff22bc5 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -163,33 +163,33 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa
define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x11
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
-; SI-NEXT: s_mov_b32 s18, s2
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s19, s3
-; SI-NEXT: s_mov_b32 s22, s2
-; SI-NEXT: s_mov_b32 s23, s3
-; SI-NEXT: s_mov_b32 s12, s6
-; SI-NEXT: s_mov_b32 s13, s7
-; SI-NEXT: s_mov_b32 s16, s8
-; SI-NEXT: s_mov_b32 s17, s9
-; SI-NEXT: s_mov_b32 s20, s10
-; SI-NEXT: s_mov_b32 s21, s11
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s22, s10
+; SI-NEXT: s_mov_b32 s23, s11
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s16, s4
+; SI-NEXT: s_mov_b32 s17, s5
+; SI-NEXT: s_mov_b32 s20, s6
+; SI-NEXT: s_mov_b32 s21, s7
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 3504546801c93b..2258f6a7b5483a 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -334,58 +334,58 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
; SI-LABEL: ctpop_i64_in_br:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s8, s[4:5], 0xf
+; SI-NEXT: s_load_dword s6, s[4:5], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_cmp_lg_u32 s8, 0
+; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_cbranch_scc0 .LBB7_4
; SI-NEXT: ; %bb.1: ; %else
-; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2
+; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x2
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz .LBB7_3
; SI-NEXT: .LBB7_2: ; %if
-; SI-NEXT: s_bcnt1_i32_b64 s4, s[6:7]
-; SI-NEXT: s_mov_b32 s5, 0
+; SI-NEXT: s_bcnt1_i32_b64 s6, s[4:5]
+; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: .LBB7_3: ; %endif
-; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB7_4:
-; SI-NEXT: ; implicit-def: $sgpr4_sgpr5
+; SI-NEXT: ; implicit-def: $sgpr6_sgpr7
; SI-NEXT: s_branch .LBB7_2
;
; VI-LABEL: ctpop_i64_in_br:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s8, s[4:5], 0x3c
+; VI-NEXT: s_load_dword s6, s[4:5], 0x3c
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s8, 0
+; VI-NEXT: s_cmp_lg_u32 s6, 0
; VI-NEXT: s_cbranch_scc0 .LBB7_4
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8
+; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x8
; VI-NEXT: s_cbranch_execnz .LBB7_3
; VI-NEXT: .LBB7_2: ; %if
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[6:7]
-; VI-NEXT: s_mov_b32 s5, 0
+; VI-NEXT: s_bcnt1_i32_b64 s6, s[4:5]
+; VI-NEXT: s_mov_b32 s7, 0
; VI-NEXT: .LBB7_3: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB7_4:
-; VI-NEXT: ; implicit-def: $sgpr4_sgpr5
+; VI-NEXT: ; implicit-def: $sgpr6_sgpr7
; VI-NEXT: s_branch .LBB7_2
entry:
%tmp0 = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index cd85c301e16d5b..b443e654350c5e 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -151,20 +151,20 @@ entry:
define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_MulMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -174,8 +174,8 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MulMul:
@@ -1698,20 +1698,20 @@ entry:
define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_MultipleUses_add1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -1719,10 +1719,10 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4
+; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_add1:
@@ -1851,20 +1851,20 @@ entry:
define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-LABEL: idot2_MultipleUses_add1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
@@ -1872,10 +1872,10 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4
+; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_add1:
@@ -2004,20 +2004,20 @@ entry:
define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_MultipleUses_mul1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -2025,10 +2025,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4
+; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s0
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_mul1:
@@ -2163,20 +2163,20 @@ entry:
define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-LABEL: idot2_MultipleUses_mul1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
@@ -2184,10 +2184,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s4
+; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s0
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_mul1:
@@ -2322,31 +2322,31 @@ entry:
define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_MultipleUses_mul2:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s4
+; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_mul2:
@@ -2479,20 +2479,20 @@ entry:
define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-LABEL: idot2_MultipleUses_mul2:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
@@ -2500,10 +2500,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s4
+; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_mul2:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 0573de4a7f2d1d..fe693b4af67f31 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -2373,21 +2373,21 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
@@ -2406,10 +2406,10 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
@@ -2418,51 +2418,50 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v15
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v14
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v14
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v15
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v11
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v10
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v10
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v11
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[12:15]
-; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v1
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v2
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v1
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0
-; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v6
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v7
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[4:7]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir
index d32163bb692357..057769372c0418 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir
@@ -14,7 +14,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]]
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -60,7 +60,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub1, [[COPY]].sub0, %subreg.sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]]
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -106,7 +106,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[COPY]].sub1, %subreg.sub2
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]]
%0:vreg_64 = COPY $vgpr1_vgpr2
%1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -129,7 +129,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[COPY]].sub1, %subreg.sub2
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]]
%0:vreg_64 = COPY $vgpr1_vgpr2
%1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir
index c5e51aa1b4fe87..e1ff42125ce9a4 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir
@@ -23,8 +23,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[REG_SEQUENCE]].sub1, %subreg.sub0, [[REG_SEQUENCE]].sub0, %subreg.sub1
- ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
; GCN-NEXT: KILL [[COPY3]], implicit [[COPY2]]
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index e2bcf3f6a2e2cd..3a872a6080952b 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -9779,111 +9779,118 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0
; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT: s_mov_b32 s2, 0x86a00
-; GFX6-NEXT: s_mov_b64 s[8:9], exec
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:208
; GFX6-NEXT: s_mov_b32 s2, 0x86600
+; GFX6-NEXT: s_mov_b64 s[8:9], exec
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:192
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
; GFX6-NEXT: s_mov_b32 s2, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:176
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
; GFX6-NEXT: s_mov_b32 s2, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:160
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
; GFX6-NEXT: s_mov_b32 s2, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:144
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
; GFX6-NEXT: s_mov_b32 s2, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:128
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
; GFX6-NEXT: s_mov_b32 s2, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:112
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
; GFX6-NEXT: s_mov_b32 s2, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:96
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
; GFX6-NEXT: s_mov_b32 s2, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:80
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
; GFX6-NEXT: s_mov_b32 s2, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:64
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
; GFX6-NEXT: s_mov_b32 s2, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
+; GFX6-NEXT: s_mov_b32 s2, 0x83e00
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64
+; GFX6-NEXT: s_mov_b32 s2, 0x83a00
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
+; GFX6-NEXT: s_mov_b32 s2, 0x83200
+; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[8:11], v[5:6], s[4:7], 0 addr64
-; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:16
+; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32
; GFX6-NEXT: s_mov_b32 s2, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32
-; GFX6-NEXT: s_mov_b32 s2, 0x83a00
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
@@ -9898,16 +9905,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[8:9]
-; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT: s_mov_b32 s0, 0x83e00
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v8
+; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48
+; GFX6-NEXT: s_mov_b32 s0, 0x86a00
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, 1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s0 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: s_waitcnt expcnt(3)
+; GFX6-NEXT: v_mov_b32_e32 v7, 1
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen
; GFX6-NEXT: ;;#ASMSTART
@@ -9930,7 +9938,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
-; GFX6-NEXT: s_mov_b32 s6, 0x83200
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[8:15]
; GFX6-NEXT: ;;#ASMEND
@@ -9949,11 +9957,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s33
; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s6 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s6 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s6 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s6 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT: s_mov_b64 vcc, s[6:7]
; GFX6-NEXT: s_cbranch_execz .LBB1_2
@@ -10184,127 +10187,126 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[4:5]
-; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:240
-; GFX6-NEXT: s_mov_b32 s0, 0x86a00
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x86600
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:224
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX6-NEXT: s_mov_b32 s0, 0x86200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:208
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:192
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:176
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:160
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:144
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:128
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:112
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:96
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:80
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:64
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:48
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: s_mov_b32 s0, 0x86a00
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:32
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:16
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64
; GFX6-NEXT: s_endpgm
;
@@ -10322,59 +10324,60 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:224
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:208
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:192
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:160
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:144
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:112
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
+; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v19, 13, v4
+; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:96
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4
-; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:80
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:64
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:48
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:48
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:32
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:32
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:16
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_nop 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
+; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ; def s[0:7]
; GFX9-FLATSCR-NEXT: ;;#ASMEND
@@ -10403,27 +10406,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39]
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v12
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_nop 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v19
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v20
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v21
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v3
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v2
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v1
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v0
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v0
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
@@ -10439,53 +10441,53 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:112
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:112
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:96
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:96
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:80
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:80
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:64
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:64
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:48
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:48
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:32
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:16
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[7:10], s[36:37]
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:32
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:240
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:16
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37]
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:240
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:224
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:208
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:192
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:160
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:160
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:144
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-branch-commits
mailing list