[llvm] [AMDGPU] Enable reordering of VMEM loads during clustering (PR #107986)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 00:19:01 PST 2026
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/107986
>From b86c25a8cfb7eb05e62265ad6910ef89b1dd0b76 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 8 Dec 2025 10:18:24 +0900
Subject: [PATCH 1/3] [AMDGPU] Enable reordering of VMEM loads during
clustering
The intention of doing this is to allow some additional overlap
of computation with memory loads: because loads will be issued in
an order closer to their usage, more incremental s_waitcnt
instructions can be introduced.
On average this yields a very small reduction in VGPR pressure,
although edge cases may see increased pressure.
For the benefit of future tuning, add support for the function attribute
"amdgpu-reorder-while-clustering" to disable/enable the reordering
behaviour per function.
---
llvm/lib/CodeGen/MachineScheduler.cpp | 7 +++--
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 28 +++++++++++++++----
2 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 6697c0a110dc3..df2a3f1269c82 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -2120,8 +2120,11 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
-
- if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
+ bool CanReorder = false;
+ if (ReorderWhileClustering)
+ CanReorder = TII->areMemAccessesTriviallyDisjoint(*SUa->getInstr(),
+ *SUb->getInstr());
+ if (SUa->NodeNum > SUb->NodeNum && !CanReorder)
std::swap(SUa, SUb);
// FIXME: Is this check really required?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1114a1105237c..c0761cdb1b0d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -605,6 +605,11 @@ static cl::opt<bool> EnableUniformIntrinsicCombine(
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ ReorderWhileClustering("amdgpu-reorder-while-clustering",
+ cl::desc("Enable reordering during load clustering"),
+ cl::init(true), cl::Hidden);
+
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -703,12 +708,21 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
return new SIScheduleDAGMI(C);
}
+static bool getReorderWhileClustering(const MachineFunction *MF) {
+ if (!ReorderWhileClustering)
+ return false;
+ Attribute FnAttr =
+ MF->getFunction().getFnAttribute("amdgpu-reorder-while-clustering");
+ return !FnAttr.isValid() || FnAttr.getValueAsBool();
+}
+
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
@@ -732,7 +746,8 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
@@ -746,7 +761,8 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto *DAG = new GCNIterativeScheduler(
C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
@@ -764,7 +780,8 @@ static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
@@ -863,7 +880,8 @@ llvm::ScheduleDAGInstrs *
AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = createSchedLive(C);
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
>From d07fb82be9bade4ed02123e39b054621ebd49800 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 2 Mar 2026 15:13:18 +0900
Subject: [PATCH 2/3] - Reviewer comments
---
llvm/lib/CodeGen/MachineScheduler.cpp | 7 ++---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 28 +++++++------------
2 files changed, 13 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index df2a3f1269c82..b9b9ed18105c7 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -2120,10 +2120,9 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
- bool CanReorder = false;
- if (ReorderWhileClustering)
- CanReorder = TII->areMemAccessesTriviallyDisjoint(*SUa->getInstr(),
- *SUb->getInstr());
+ bool CanReorder = ReorderWhileClustering &&
+ TII->areMemAccessesTriviallyDisjoint(*SUa->getInstr(),
+ *SUb->getInstr());
if (SUa->NodeNum > SUb->NodeNum && !CanReorder)
std::swap(SUa, SUb);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c0761cdb1b0d1..bc5a48c63519c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -708,21 +708,13 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
return new SIScheduleDAGMI(C);
}
-static bool getReorderWhileClustering(const MachineFunction *MF) {
- if (!ReorderWhileClustering)
- return false;
- Attribute FnAttr =
- MF->getFunction().getFnAttribute("amdgpu-reorder-while-clustering");
- return !FnAttr.isValid() || FnAttr.getValueAsBool();
-}
-
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
- DAG->addMutation(createLoadClusterDAGMutation(
- DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
+ DAG->addMutation(
+ createLoadClusterDAGMutation(DAG->TII, DAG->TRI, ReorderWhileClustering));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
@@ -746,8 +738,8 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
- DAG->addMutation(createLoadClusterDAGMutation(
- DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
+ DAG->addMutation(
+ createLoadClusterDAGMutation(DAG->TII, DAG->TRI, ReorderWhileClustering));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
@@ -761,8 +753,8 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto *DAG = new GCNIterativeScheduler(
C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
- DAG->addMutation(createLoadClusterDAGMutation(
- DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
+ DAG->addMutation(
+ createLoadClusterDAGMutation(DAG->TII, DAG->TRI, ReorderWhileClustering));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
@@ -780,8 +772,8 @@ static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
- DAG->addMutation(createLoadClusterDAGMutation(
- DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
+ DAG->addMutation(
+ createLoadClusterDAGMutation(DAG->TII, DAG->TRI, ReorderWhileClustering));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
@@ -880,8 +872,8 @@ llvm::ScheduleDAGInstrs *
AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = createSchedLive(C);
- DAG->addMutation(createLoadClusterDAGMutation(
- DAG->TII, DAG->TRI, getReorderWhileClustering(C->MF)));
+ DAG->addMutation(
+ createLoadClusterDAGMutation(DAG->TII, DAG->TRI, ReorderWhileClustering));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
>From 295ed3f3d28592b52c9fb3d20d712fc372b81a7f Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Tue, 17 Feb 2026 18:54:55 +0900
Subject: [PATCH 3/3] - Test changes
---
llvm/test/CodeGen/AMDGPU/bf16.ll | 427 ++-
.../CodeGen/AMDGPU/call-argument-types.ll | 33 +-
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 26 +-
llvm/test/CodeGen/AMDGPU/ds_read2.ll | 128 +-
.../CodeGen/AMDGPU/extract_vector_dynelt.ll | 22 +-
llvm/test/CodeGen/AMDGPU/fma-combine.ll | 12 +-
llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll | 2 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 12 +-
.../CodeGen/AMDGPU/indirect-addressing-si.ll | 265 +-
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 836 ++--
.../insert_waitcnt_for_precise_memory.ll | 4 +-
...e92561-restore-undef-scc-verifier-error.ll | 55 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 40 +-
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 96 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 48 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 177 +-
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 759 ++--
llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 664 ++--
llvm/test/CodeGen/AMDGPU/load-local.128.ll | 114 +-
llvm/test/CodeGen/AMDGPU/load-local.96.ll | 77 +-
llvm/test/CodeGen/AMDGPU/max.i16.ll | 8 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 88 +-
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 3415 +++++++++--------
.../AMDGPU/memmove-param-combinations.ll | 178 +-
llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 48 +-
.../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 196 +-
llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll | 24 +-
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 315 +-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 34 +-
llvm/test/CodeGen/AMDGPU/pr51516.mir | 7 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 54 +-
.../AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll | 24 +-
.../rewrite-vgpr-mfma-to-agpr.gfx90a.ll | 20 +-
.../rewrite-vgpr-mfma-to-agpr.gfx950.ll | 64 +-
.../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 192 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 24 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 91 +-
llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll | 6 +-
38 files changed, 4260 insertions(+), 4325 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 247051b4a224d..aa0edd3ff9fd8 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -8845,135 +8845,137 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
; GFX950-NEXT: global_load_ushort v1, v[2:3], off offset:2
-; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12
-; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8
+; GFX950-NEXT: global_load_ushort v4, v[2:3], off
+; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:6
; GFX950-NEXT: global_load_ushort v6, v[2:3], off offset:4
-; GFX950-NEXT: global_load_ushort v7, v[2:3], off
-; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:6
-; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:10
-; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:14
+; GFX950-NEXT: global_load_ushort v7, v[2:3], off offset:10
+; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:8
+; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:14
+; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:12
; GFX950-NEXT: global_load_ushort v11, v[2:3], off offset:18
-; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:28
-; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:24
+; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:16
+; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:22
; GFX950-NEXT: global_load_ushort v14, v[2:3], off offset:20
-; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:16
-; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:22
-; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:26
-; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:30
+; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:26
+; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:24
+; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:30
+; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:28
; GFX950-NEXT: global_load_ushort v19, v[2:3], off offset:34
-; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:44
-; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:40
+; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:32
+; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:38
; GFX950-NEXT: global_load_ushort v22, v[2:3], off offset:36
-; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:32
-; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:38
-; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:42
-; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:46
+; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:42
+; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:40
+; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:46
+; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:44
; GFX950-NEXT: global_load_ushort v42, v[2:3], off offset:50
; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:62
; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:60
-; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56
-; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52
-; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48
-; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54
-; GFX950-NEXT: global_load_ushort v58, v[2:3], off offset:58
+; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:48
+; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:54
+; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:58
+; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:56
+; GFX950-NEXT: global_load_ushort v61, v[2:3], off offset:52
; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX950-NEXT: s_waitcnt vmcnt(31)
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX950-NEXT: s_waitcnt vmcnt(30)
-; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX950-NEXT: s_waitcnt vmcnt(29)
-; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v5
; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
; GFX950-NEXT: s_waitcnt vmcnt(27)
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v7
; GFX950-NEXT: s_waitcnt vmcnt(26)
-; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v8
; GFX950-NEXT: s_waitcnt vmcnt(25)
-; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v9
; GFX950-NEXT: s_waitcnt vmcnt(24)
-; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v10
; GFX950-NEXT: s_waitcnt vmcnt(23)
-; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v11
; GFX950-NEXT: s_waitcnt vmcnt(22)
-; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX950-NEXT: s_waitcnt vmcnt(21)
-; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v13
; GFX950-NEXT: s_waitcnt vmcnt(20)
-; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v14
; GFX950-NEXT: s_waitcnt vmcnt(19)
-; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v15
; GFX950-NEXT: s_waitcnt vmcnt(18)
-; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v16
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v16
+; GFX950-NEXT: s_waitcnt vmcnt(17)
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v17
; GFX950-NEXT: s_waitcnt vmcnt(16)
-; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v18
; GFX950-NEXT: s_waitcnt vmcnt(15)
-; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v19
; GFX950-NEXT: s_waitcnt vmcnt(14)
-; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v20
; GFX950-NEXT: s_waitcnt vmcnt(13)
-; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v21
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v30
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v31
-; GFX950-NEXT: s_waitcnt vmcnt(10)
-; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v24
-; GFX950-NEXT: s_waitcnt vmcnt(9)
-; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v21
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v30
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v31
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v32
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v33
; GFX950-NEXT: s_waitcnt vmcnt(8)
-; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v26
; GFX950-NEXT: s_waitcnt vmcnt(7)
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v42
; GFX950-NEXT: s_waitcnt vmcnt(6)
; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v32
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v33
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v36
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v37
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v38
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v39
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v38
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v39
; GFX950-NEXT: v_cvt_f64_f32_e32 v[38:39], v44
; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v42
; GFX950-NEXT: s_waitcnt vmcnt(5)
; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v46
; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v42
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v58
+; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v57
; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:240
; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v46
-; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v47
+; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v60
; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v47
; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v57
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56
; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v46
-; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v17
-; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v23
; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:224
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v27
; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v34
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v35
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v48
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[50:51], v49
+; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v61
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v28
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v29
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v34
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v35
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v36
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v37
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v48
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v49
; GFX950-NEXT: v_cvt_f64_f32_e32 v[48:49], v52
; GFX950-NEXT: v_cvt_f64_f32_e32 v[54:55], v53
; GFX950-NEXT: v_cvt_f64_f32_e32 v[52:53], v40
; GFX950-NEXT: v_cvt_f64_f32_e32 v[40:41], v41
; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v1
; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
; GFX950-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[50:51], v50
; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:208
; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:192
; GFX950-NEXT: scratch_store_dwordx4 v0, v[38:41], off offset:176
@@ -8988,6 +8990,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32
; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16
; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off
+; GFX950-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
@@ -9202,130 +9205,130 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:2
-; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:12
-; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:8
+; GFX11-NEXT: global_load_u16 v4, v[1:2], off
+; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:6
; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:4
-; GFX11-NEXT: global_load_u16 v7, v[1:2], off
-; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6
-; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10
-; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14
+; GFX11-NEXT: global_load_u16 v7, v[1:2], off offset:10
+; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:8
+; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:14
+; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:12
; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:18
-; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:28
-; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:24
+; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:16
+; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:22
; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:20
-; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16
-; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22
-; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26
-; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30
+; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:26
+; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:24
+; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:30
+; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:28
; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:34
-; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:44
-; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:40
+; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:32
+; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:38
; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:36
-; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32
-; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38
-; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42
-; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46
+; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:42
+; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:40
+; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:46
+; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:44
; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:50
-; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:60
-; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:56
+; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:48
+; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:54
; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:52
-; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48
-; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
-; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
-; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
+; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:58
+; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:56
+; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:62
+; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:60
; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(30)
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v4
; GFX11-NEXT: s_waitcnt vmcnt(29)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: s_waitcnt vmcnt(28)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: s_waitcnt vmcnt(27)
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v7
; GFX11-NEXT: s_waitcnt vmcnt(26)
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v8
; GFX11-NEXT: s_waitcnt vmcnt(25)
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-NEXT: s_waitcnt vmcnt(24)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: s_waitcnt vmcnt(23)
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v11
; GFX11-NEXT: s_waitcnt vmcnt(22)
-; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v12
; GFX11-NEXT: s_waitcnt vmcnt(21)
; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11-NEXT: s_waitcnt vmcnt(20)
; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11-NEXT: s_waitcnt vmcnt(19)
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v15
; GFX11-NEXT: s_waitcnt vmcnt(18)
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v16
; GFX11-NEXT: s_waitcnt vmcnt(17)
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11-NEXT: s_waitcnt vmcnt(16)
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX11-NEXT: s_waitcnt vmcnt(15)
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v19
; GFX11-NEXT: s_waitcnt vmcnt(14)
-; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v20
; GFX11-NEXT: s_waitcnt vmcnt(13)
; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11-NEXT: s_waitcnt vmcnt(12)
; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-NEXT: s_waitcnt vmcnt(11)
-; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v23
; GFX11-NEXT: s_waitcnt vmcnt(10)
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v24
; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v27
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v28
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v31
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v32
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v65
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v29
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v29
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v65
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v68
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v33
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v1
; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v30
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v52
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v49
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v21
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v53
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v64
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v25
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v26
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v49
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v52
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v35
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v36
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v48
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v21
; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v22
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v34
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v35
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v36
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v34
; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v101
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v100
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v13
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v17
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v18
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v100
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v13
; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v14
; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v39
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v38
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v5
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v9
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v10
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v38
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v6
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
@@ -9355,99 +9358,101 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: s_clause 0x1f
; GFX1250-NEXT: global_load_u16 v1, v[2:3], off offset:2
-; GFX1250-NEXT: global_load_u16 v10, v[2:3], off offset:12
-; GFX1250-NEXT: global_load_u16 v6, v[2:3], off offset:8
-; GFX1250-NEXT: global_load_u16 v4, v[2:3], off offset:4
-; GFX1250-NEXT: global_load_u16 v5, v[2:3], off
-; GFX1250-NEXT: global_load_u16 v7, v[2:3], off offset:6
-; GFX1250-NEXT: global_load_u16 v8, v[2:3], off offset:62
-; GFX1250-NEXT: global_load_u16 v9, v[2:3], off offset:60
-; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:58
-; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:56
-; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:10
-; GFX1250-NEXT: global_load_u16 v14, v[2:3], off offset:14
+; GFX1250-NEXT: global_load_u16 v4, v[2:3], off
+; GFX1250-NEXT: global_load_u16 v5, v[2:3], off offset:6
+; GFX1250-NEXT: global_load_u16 v6, v[2:3], off offset:62
+; GFX1250-NEXT: global_load_u16 v7, v[2:3], off offset:60
+; GFX1250-NEXT: global_load_u16 v8, v[2:3], off offset:4
+; GFX1250-NEXT: global_load_u16 v9, v[2:3], off offset:58
+; GFX1250-NEXT: global_load_u16 v10, v[2:3], off offset:56
+; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:10
+; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:8
+; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:14
+; GFX1250-NEXT: global_load_u16 v14, v[2:3], off offset:12
; GFX1250-NEXT: global_load_u16 v15, v[2:3], off offset:18
-; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:28
-; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:24
+; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:16
+; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:22
; GFX1250-NEXT: global_load_u16 v18, v[2:3], off offset:20
-; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:16
-; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:22
-; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:26
-; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:30
+; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:26
+; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:24
+; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:30
+; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:28
; GFX1250-NEXT: global_load_u16 v23, v[2:3], off offset:34
-; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:44
-; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:40
+; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:32
+; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:38
; GFX1250-NEXT: global_load_u16 v26, v[2:3], off offset:36
-; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:32
-; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:38
-; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:42
-; GFX1250-NEXT: global_load_u16 v30, v[2:3], off offset:46
+; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:42
+; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:40
+; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:46
+; GFX1250-NEXT: global_load_u16 v30, v[2:3], off offset:44
; GFX1250-NEXT: global_load_u16 v31, v[2:3], off offset:50
-; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:52
-; GFX1250-NEXT: global_load_u16 v33, v[2:3], off offset:48
-; GFX1250-NEXT: global_load_u16 v34, v[2:3], off offset:54
+; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:48
+; GFX1250-NEXT: global_load_u16 v33, v[2:3], off offset:54
+; GFX1250-NEXT: global_load_u16 v34, v[2:3], off offset:52
; GFX1250-NEXT: s_wait_loadcnt 0x1e
-; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v37, 16, v10
-; GFX1250-NEXT: s_wait_loadcnt 0x1b
-; GFX1250-NEXT: v_dual_lshlrev_b32 v85, 16, v4 :: v_dual_lshlrev_b32 v84, 16, v5
-; GFX1250-NEXT: s_wait_loadcnt 0x19
+; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v84, 16, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x1c
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_lshlrev_b32 v35, 16, v7 :: v_dual_lshlrev_b32 v2, 16, v8
-; GFX1250-NEXT: s_wait_loadcnt 0x17
-; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v9 :: v_dual_lshlrev_b32 v7, 16, v11
-; GFX1250-NEXT: s_wait_loadcnt 0x15
-; GFX1250-NEXT: v_dual_lshlrev_b32 v11, 16, v12 :: v_dual_lshlrev_b32 v12, 16, v13
-; GFX1250-NEXT: v_lshlrev_b32_e32 v13, 16, v6
+; GFX1250-NEXT: v_dual_lshlrev_b32 v35, 16, v5 :: v_dual_lshlrev_b32 v2, 16, v6
+; GFX1250-NEXT: s_wait_loadcnt 0x1a
+; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v7 :: v_dual_lshlrev_b32 v85, 16, v8
+; GFX1250-NEXT: s_wait_loadcnt 0x18
+; GFX1250-NEXT: v_dual_lshlrev_b32 v6, 16, v9 :: v_dual_lshlrev_b32 v7, 16, v10
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
-; GFX1250-NEXT: s_wait_loadcnt 0x13
-; GFX1250-NEXT: v_dual_lshlrev_b32 v36, 16, v14 :: v_dual_lshlrev_b32 v38, 16, v15
+; GFX1250-NEXT: s_wait_loadcnt 0x14
+; GFX1250-NEXT: v_dual_lshlrev_b32 v13, 16, v13 :: v_dual_lshlrev_b32 v36, 16, v14
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
+; GFX1250-NEXT: s_wait_loadcnt 0x12
+; GFX1250-NEXT: v_dual_lshlrev_b32 v37, 16, v15 :: v_dual_lshlrev_b32 v81, 16, v16
+; GFX1250-NEXT: s_wait_loadcnt 0x10
+; GFX1250-NEXT: v_dual_lshlrev_b32 v38, 16, v17 :: v_dual_lshlrev_b32 v82, 16, v18
; GFX1250-NEXT: s_wait_loadcnt 0xc
; GFX1250-NEXT: v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v39, 16, v22
-; GFX1250-NEXT: s_wait_loadcnt 0xb
-; GFX1250-NEXT: v_dual_lshlrev_b32 v49, 16, v23 :: v_dual_lshlrev_b32 v68, 16, v17
-; GFX1250-NEXT: s_wait_loadcnt 0x9
-; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v16 :: v_dual_lshlrev_b32 v25, 16, v25
-; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX1250-NEXT: s_wait_loadcnt 0x5
-; GFX1250-NEXT: v_dual_lshlrev_b32 v50, 16, v28 :: v_dual_lshlrev_b32 v64, 16, v29
-; GFX1250-NEXT: s_wait_loadcnt 0x3
-; GFX1250-NEXT: v_dual_lshlrev_b32 v51, 16, v30 :: v_dual_lshlrev_b32 v52, 16, v31
-; GFX1250-NEXT: v_dual_lshlrev_b32 v69, 16, v27 :: v_dual_lshlrev_b32 v70, 16, v26
+; GFX1250-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v24, 16, v24
+; GFX1250-NEXT: s_wait_loadcnt 0x8
+; GFX1250-NEXT: v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v69, 16, v26
+; GFX1250-NEXT: s_wait_loadcnt 0x6
+; GFX1250-NEXT: v_dual_lshlrev_b32 v64, 16, v27 :: v_dual_lshlrev_b32 v65, 16, v28
+; GFX1250-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NEXT: v_dual_lshlrev_b32 v49, 16, v29 :: v_dual_lshlrev_b32 v50, 16, v30
+; GFX1250-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NEXT: v_dual_lshlrev_b32 v51, 16, v31 :: v_dual_lshlrev_b32 v32, 16, v32
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_dual_lshlrev_b32 v53, 16, v34 :: v_dual_lshlrev_b32 v32, 16, v32
-; GFX1250-NEXT: v_lshlrev_b32_e32 v33, 16, v33
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v38
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v39
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[38:39], v50
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v53
+; GFX1250-NEXT: v_dual_lshlrev_b32 v33, 16, v33 :: v_dual_lshlrev_b32 v52, 16, v34
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v35
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v48
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[34:35], v49
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[48:49], v33
-; GFX1250-NEXT: v_dual_lshlrev_b32 v20, 16, v20 :: v_dual_lshlrev_b32 v81, 16, v18
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[34:35], v48
+; GFX1250-NEXT: v_dual_lshlrev_b32 v68, 16, v19 :: v_dual_lshlrev_b32 v20, 16, v20
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v33
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v52
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[66:67], v64
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[64:65], v25
-; GFX1250-NEXT: v_lshlrev_b32_e32 v80, 16, v19
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v36
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v37
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[36:37], v70
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[64:65], v65
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v36
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v37
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v38
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v39
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[38:39], v25
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[36:37], v69
; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:240
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v51
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[50:51], v52
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v32
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v24
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[32:33], v69
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[70:71], v21
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[68:69], v68
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v20
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v81
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v80
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[82:83], v12
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[80:81], v13
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v49
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v50
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[50:51], v51
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[48:49], v32
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[32:33], v24
+; GFX1250-NEXT: v_dual_lshlrev_b32 v80, 16, v11 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v21
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[70:71], v68
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[68:69], v20
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v82
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v81
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v13
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[82:83], v80
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[80:81], v12
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v85
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v1
; GFX1250-NEXT: scratch_store_b128 v0, v[6:9], off offset:224
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 293e24f2d1b9d..ada8615212f79 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -6929,51 +6929,52 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32
; VI-NEXT: s_setpc_b64 s[4:5]
;
; CI-LABEL: tail_call_byval_align16:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: buffer_store_dword v32, off, s[0:3], s32
; CI-NEXT: s_setpc_b64 s[4:5]
;
; SDAG-LABEL: tail_call_byval_align16:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32
; SDAG-NEXT: s_getpc_b64 s[4:5]
; SDAG-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; SDAG-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; SDAG-NEXT: s_waitcnt vmcnt(2)
-; SDAG-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32
; SDAG-NEXT: s_waitcnt vmcnt(1)
; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_dword v32, off, s[0:3], s32
; SDAG-NEXT: s_setpc_b64 s[4:5]
;
; GFX11-LABEL: tail_call_byval_align16:
@@ -6994,17 +6995,17 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; HSA: ; %bb.0: ; %entry
; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; HSA-NEXT: s_waitcnt vmcnt(1)
+; HSA-NEXT: s_waitcnt vmcnt(0)
; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; HSA-NEXT: s_waitcnt vmcnt(2)
-; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32
; HSA-NEXT: s_waitcnt vmcnt(1)
; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; HSA-NEXT: s_waitcnt vmcnt(1)
+; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32
; HSA-NEXT: s_setpc_b64 s[4:5]
;
; GISEL-LABEL: tail_call_byval_align16:
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 0eed0ba50092b..c9a3c57ffb69b 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1915,19 +1915,20 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; GFX10-NEXT: v_mov_b32_e32 v7, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x5
-; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
-; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
-; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
@@ -1975,19 +1976,20 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:6
; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
-; GFX11-NEXT: global_load_u8 v6, v0, s[2:3] offset:1
+; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1
+; GFX11-NEXT: global_load_u8 v6, v0, s[2:3] offset:6
; GFX11-NEXT: global_load_d16_b16 v4, v0, s[2:3] offset:4
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
-; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9f1b55ea3b1ef..03977565086fb 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -529,30 +529,29 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
-; CI-NEXT: ds_read_u8 v2, v1 offset:1
-; CI-NEXT: ds_read_u8 v3, v1 offset:34
-; CI-NEXT: ds_read_u8 v4, v1 offset:32
-; CI-NEXT: ds_read_u8 v5, v1 offset:2
-; CI-NEXT: ds_read_u8 v6, v1
-; CI-NEXT: ds_read_u8 v7, v1 offset:3
-; CI-NEXT: ds_read_u8 v8, v1 offset:33
+; CI-NEXT: ds_read_u8 v2, v1
+; CI-NEXT: ds_read_u8 v3, v1 offset:1
+; CI-NEXT: ds_read_u8 v4, v1 offset:2
+; CI-NEXT: ds_read_u8 v5, v1 offset:3
+; CI-NEXT: ds_read_u8 v6, v1 offset:32
+; CI-NEXT: ds_read_u8 v7, v1 offset:33
+; CI-NEXT: ds_read_u8 v8, v1 offset:34
; CI-NEXT: ds_read_u8 v1, v1 offset:35
-; CI-NEXT: s_waitcnt lgkmcnt(7)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: s_waitcnt lgkmcnt(3)
-; CI-NEXT: v_or_b32_e32 v2, v2, v6
-; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v7
-; CI-NEXT: v_or_b32_e32 v5, v6, v5
-; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: s_waitcnt lgkmcnt(6)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: s_waitcnt lgkmcnt(4)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v5
+; CI-NEXT: v_or_b32_e32 v3, v3, v4
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT: v_or_b32_e32 v2, v5, v2
-; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; CI-NEXT: v_or_b32_e32 v1, v1, v3
-; CI-NEXT: v_or_b32_e32 v4, v5, v4
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; CI-NEXT: v_or_b32_e32 v1, v1, v8
+; CI-NEXT: v_or_b32_e32 v3, v3, v6
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_or_b32_e32 v1, v1, v4
+; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
@@ -622,30 +621,29 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
-; CI-NEXT: ds_read_u8 v2, v1 offset:6
-; CI-NEXT: ds_read_u8 v3, v1 offset:11
-; CI-NEXT: ds_read_u8 v4, v1 offset:9
-; CI-NEXT: ds_read_u8 v5, v1 offset:7
-; CI-NEXT: ds_read_u8 v6, v1 offset:5
-; CI-NEXT: ds_read_u8 v7, v1 offset:8
-; CI-NEXT: ds_read_u8 v8, v1 offset:10
+; CI-NEXT: ds_read_u8 v2, v1 offset:5
+; CI-NEXT: ds_read_u8 v3, v1 offset:6
+; CI-NEXT: ds_read_u8 v4, v1 offset:7
+; CI-NEXT: ds_read_u8 v5, v1 offset:8
+; CI-NEXT: ds_read_u8 v6, v1 offset:9
+; CI-NEXT: ds_read_u8 v7, v1 offset:10
+; CI-NEXT: ds_read_u8 v8, v1 offset:11
; CI-NEXT: ds_read_u8 v1, v1 offset:12
-; CI-NEXT: s_waitcnt lgkmcnt(7)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: s_waitcnt lgkmcnt(3)
-; CI-NEXT: v_or_b32_e32 v2, v2, v6
-; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v7
-; CI-NEXT: v_or_b32_e32 v5, v6, v5
-; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: s_waitcnt lgkmcnt(6)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: s_waitcnt lgkmcnt(4)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v5
+; CI-NEXT: v_or_b32_e32 v3, v3, v4
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT: v_or_b32_e32 v2, v5, v2
-; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; CI-NEXT: v_or_b32_e32 v1, v1, v3
-; CI-NEXT: v_or_b32_e32 v4, v5, v4
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; CI-NEXT: v_or_b32_e32 v1, v1, v8
+; CI-NEXT: v_or_b32_e32 v3, v3, v6
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_or_b32_e32 v1, v1, v4
+; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
@@ -716,15 +714,15 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0
; CI-NEXT: ds_read_u16 v2, v1 offset:2
-; CI-NEXT: ds_read_u16 v3, v1 offset:32
-; CI-NEXT: ds_read_u16 v4, v1
-; CI-NEXT: ds_read_u16 v1, v1 offset:34
+; CI-NEXT: ds_read_u16 v3, v1
+; CI-NEXT: ds_read_u16 v4, v1 offset:34
+; CI-NEXT: ds_read_u16 v1, v1 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_or_b32_e32 v2, v2, v4
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_or_b32_e32 v1, v1, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; CI-NEXT: v_or_b32_e32 v1, v3, v1
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -1453,32 +1451,28 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out)
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read_u8 v1, v0 offset:70
-; CI-NEXT: ds_read_u8 v2, v0 offset:72
-; CI-NEXT: ds_read_u8 v3, v0 offset:71
-; CI-NEXT: ds_read_u8 v4, v0 offset:69
+; CI-NEXT: ds_read_u8 v2, v0 offset:65
+; CI-NEXT: ds_read_u8 v3, v0 offset:66
+; CI-NEXT: ds_read_u8 v4, v0 offset:67
; CI-NEXT: ds_read_u8 v5, v0 offset:68
-; CI-NEXT: s_waitcnt lgkmcnt(4)
-; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT: s_waitcnt lgkmcnt(3)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: s_waitcnt lgkmcnt(1)
-; CI-NEXT: v_or_b32_e32 v1, v1, v4
-; CI-NEXT: ds_read_u8 v4, v0 offset:66
-; CI-NEXT: ds_read_u8 v6, v0 offset:67
-; CI-NEXT: ds_read_u8 v0, v0 offset:65
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: ds_read_u8 v1, v0 offset:69
+; CI-NEXT: ds_read_u8 v6, v0 offset:70
+; CI-NEXT: ds_read_u8 v7, v0 offset:71
+; CI-NEXT: ds_read_u8 v0, v0 offset:72
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_or_b32_e32 v1, v2, v1
+; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; CI-NEXT: v_or_b32_e32 v0, v2, v0
+; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; CI-NEXT: v_or_b32_e32 v1, v6, v1
+; CI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; CI-NEXT: v_or_b32_e32 v0, v0, v7
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v1, v0, v1
+; CI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
+; CI-NEXT: v_or_b32_e32 v0, v0, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; CI-NEXT: v_or_b32_e32 v2, v2, v6
+; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: v_or_b32_e32 v0, v2, v0
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 975695b03c114..7b8a389453abb 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -1091,12 +1091,14 @@ entry:
define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: double15_extelt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c
; GCN-NEXT: s_mov_b32 s36, 0
; GCN-NEXT: s_mov_b32 s65, 0x402e0000
; GCN-NEXT: s_mov_b32 s63, 0x402c0000
; GCN-NEXT: s_mov_b32 s61, 0x402a0000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s0, 1
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_mov_b32 s59, 0x40280000
; GCN-NEXT: s_mov_b32 s57, 0x40260000
; GCN-NEXT: s_mov_b32 s55, 0x40240000
@@ -1126,8 +1128,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v31, s67
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 m0, s2, 1
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
@@ -1159,6 +1159,7 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v30, s66
; GCN-NEXT: v_movrels_b32_e32 v32, v1
; GCN-NEXT: v_movrels_b32_e32 v31, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32]
@@ -1347,12 +1348,14 @@ entry:
define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: double16_extelt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c
; GCN-NEXT: s_mov_b32 s36, 0
; GCN-NEXT: s_mov_b32 s67, 0x40300000
; GCN-NEXT: s_mov_b32 s65, 0x402e0000
; GCN-NEXT: s_mov_b32 s63, 0x402c0000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 m0, s0, 1
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_mov_b32 s61, 0x402a0000
; GCN-NEXT: s_mov_b32 s59, 0x40280000
; GCN-NEXT: s_mov_b32 s57, 0x40260000
@@ -1384,8 +1387,6 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v31, s67
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 m0, s2, 1
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
@@ -1417,6 +1418,7 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v30, s66
; GCN-NEXT: v_movrels_b32_e32 v32, v1
; GCN-NEXT: v_movrels_b32_e32 v31, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32]
@@ -1605,14 +1607,14 @@ entry:
define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: float32_extelt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GCN-NEXT: v_mov_b32_e32 v3, 4.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: v_mov_b32_e32 v3, 4.0
; GCN-NEXT: v_mov_b32_e32 v4, 0x40a00000
; GCN-NEXT: v_mov_b32_e32 v5, 0x40c00000
; GCN-NEXT: v_mov_b32_e32 v6, 0x40e00000
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 8fc6904f5009c..4f65825c4d8c5 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -2707,14 +2707,14 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3]
+; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3] offset:16
; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3
-; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2
-; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1
-; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0
+; GFX11-NEXT: v_fma_f32 v3, v11, -v3, -v7
+; GFX11-NEXT: v_fma_f32 v2, v10, -v2, -v6
+; GFX11-NEXT: v_fma_f32 v1, v9, -v1, -v5
+; GFX11-NEXT: v_fma_f32 v0, v8, -v0, -v4
; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
index 0658997d087bf..b8a3aa97d2b4e 100644
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -34,8 +34,8 @@ define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v1, v3, v1, v5
; GCN-NEXT: v_or3_b32 v0, v2, v0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 2daed9b69384f..6c5b6b799a2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -124,10 +124,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
;
; GFX8V5-LABEL: llvm_amdgcn_is_shared:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
-; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0xcc
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT: s_cmp_eq_u32 s0, s1
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
@@ -178,10 +178,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
;
; GFX8V5-LABEL: llvm_amdgcn_is_private:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
-; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0xc8
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT: s_cmp_eq_u32 s0, s1
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 8fcf1ad3fbc95..283046201eb9b 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -6918,186 +6918,193 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
;
; VI-MOVREL-LABEL: insert_w_offset_multiple_in_block:
; VI-MOVREL: ; %bb.0: ; %entry
-; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v18, 0x40400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v19, 4.0
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1
-; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
-; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: s_add_i32 m0, s0, 1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v20, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v21, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v22, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v23, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v24, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v25, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v26, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v27, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v28, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v29, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v30, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v31, 0x41800000
; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000
-; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
-; VI-MOVREL-NEXT: s_add_i32 m0, s2, 2
-; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32
+; VI-MOVREL-NEXT: s_add_i32 m0, s0, 2
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, v16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, v17
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, v18
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, v19
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
-; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
-; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
-; VI-MOVREL-NEXT: v_mov_b32_e32 v28, v12
-; VI-MOVREL-NEXT: v_mov_b32_e32 v27, v11
-; VI-MOVREL-NEXT: v_mov_b32_e32 v26, v10
-; VI-MOVREL-NEXT: v_mov_b32_e32 v25, v9
-; VI-MOVREL-NEXT: v_mov_b32_e32 v24, v8
-; VI-MOVREL-NEXT: v_mov_b32_e32 v23, v7
-; VI-MOVREL-NEXT: v_mov_b32_e32 v22, v6
-; VI-MOVREL-NEXT: v_mov_b32_e32 v21, v5
-; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v4
-; VI-MOVREL-NEXT: v_mov_b32_e32 v19, v3
-; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v2
-; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v1
-; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, v20
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, v21
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, v22
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, v23
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, v24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, v25
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, v26
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, v27
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, v28
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, v29
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, v30
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, v31
; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
-; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3
; VI-MOVREL-NEXT: v_mov_b32_e32 v32, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
; VI-MOVREL-NEXT: s_nop 0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
-; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: v_mov_b32_e32 v29, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v28, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
; VI-MOVREL-NEXT: s_nop 0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
-; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: v_mov_b32_e32 v25, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v24, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 64
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
-; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: v_mov_b32_e32 v21, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v20, s0
; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
-; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4
; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x60
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
-; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s4
; VI-MOVREL-NEXT: s_addc_u32 s1, s1, 0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
-; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s2
-; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s3
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-MOVREL-NEXT: s_endpgm
;
; VI-IDXMODE-LABEL: insert_w_offset_multiple_in_block:
; VI-IDXMODE: ; %bb.0: ; %entry
-; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c
-; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: s_load_dword s0, s[4:5], 0x2c
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, 0x40400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, 4.0
; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
-; VI-IDXMODE-NEXT: s_add_i32 s3, s2, 1
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: s_add_i32 s1, s0, 1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v21, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v22, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v23, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v24, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v25, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v26, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v27, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, 0x41800000
; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000
-; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32
-; VI-IDXMODE-NEXT: s_set_gpr_idx_off
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15
-; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 2
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v27, v11
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v26, v10
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v25, v9
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v24, v8
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v23, v7
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v22, v6
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v21, v5
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, v4
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, v3
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v2
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v1
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v0
-; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST)
; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v32
; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16
+; VI-IDXMODE-NEXT: s_add_i32 s0, s0, 2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, v17
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v18
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, v19
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, v20
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, v21
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, v22
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, v23
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, v24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, v25
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, v26
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, v27
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, v28
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, v29
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, v30
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, v31
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3
; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2
; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
; VI-IDXMODE-NEXT: s_nop 0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, s2
; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
; VI-IDXMODE-NEXT: s_nop 0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v25, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v24, s2
; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 64
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v21, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, s0
; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4
; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x60
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s4
; VI-IDXMODE-NEXT: s_addc_u32 s1, s1, 0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s2
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-IDXMODE-NEXT: s_endpgm
;
; GFX9-IDXMODE-LABEL: insert_w_offset_multiple_in_block:
; GFX9-IDXMODE: ; %bb.0: ; %entry
-; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-IDXMODE-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-IDXMODE-NEXT: s_add_i32 s3, s2, 1
+; GFX9-IDXMODE-NEXT: s_add_i32 s1, s0, 1
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
@@ -7111,11 +7118,11 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000
-; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST)
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32
; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15
-; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 2
+; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 2
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12
@@ -7131,10 +7138,12 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v2
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v1
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v32
; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index beeeaa32cacfd..e230bad79b8e0 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -2146,15 +2146,15 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x94
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x84
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x94
; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: s_load_dword s0, s[4:5], 0xa4
+; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_mov_b32_e32 v5, s13
@@ -2705,22 +2705,22 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-LABEL: double15_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4
-; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x114
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x104
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x114
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0xe4
; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v24, s0
; GCN-NEXT: s_load_dword s0, s[4:5], 0x124
; GCN-NEXT: v_mov_b32_e32 v25, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 m0, s0, 1
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
; GCN-NEXT: v_mov_b32_e32 v7, s15
@@ -3184,75 +3184,77 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dword s6, s[4:5], 0x44
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GCN-NEXT: s_mov_b64 s[90:91], s[4:5]
; GCN-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_bfe_u32 s9, s0, 0xf0001
-; GCN-NEXT: s_lshr_b32 s42, s1, 16
-; GCN-NEXT: v_writelane_b32 v6, s4, 0
-; GCN-NEXT: v_writelane_b32 v6, s5, 1
; GCN-NEXT: s_lshr_b32 s4, s0, 16
-; GCN-NEXT: v_writelane_b32 v6, s4, 2
+; GCN-NEXT: v_writelane_b32 v6, s4, 0
; GCN-NEXT: s_lshr_b32 s4, s0, 17
-; GCN-NEXT: v_writelane_b32 v6, s4, 3
+; GCN-NEXT: v_writelane_b32 v6, s4, 1
; GCN-NEXT: s_lshr_b32 s4, s0, 18
-; GCN-NEXT: v_writelane_b32 v6, s4, 4
+; GCN-NEXT: v_writelane_b32 v6, s4, 2
; GCN-NEXT: s_lshr_b32 s4, s0, 19
-; GCN-NEXT: v_writelane_b32 v6, s4, 5
+; GCN-NEXT: v_writelane_b32 v6, s4, 3
; GCN-NEXT: s_lshr_b32 s4, s0, 20
-; GCN-NEXT: v_writelane_b32 v6, s4, 6
+; GCN-NEXT: v_writelane_b32 v6, s4, 4
; GCN-NEXT: s_lshr_b32 s4, s0, 21
-; GCN-NEXT: v_writelane_b32 v6, s4, 7
+; GCN-NEXT: v_writelane_b32 v6, s4, 5
; GCN-NEXT: s_lshr_b32 s4, s0, 22
-; GCN-NEXT: v_writelane_b32 v6, s4, 8
+; GCN-NEXT: v_writelane_b32 v6, s4, 6
; GCN-NEXT: s_lshr_b32 s4, s0, 23
-; GCN-NEXT: v_writelane_b32 v6, s4, 9
+; GCN-NEXT: v_writelane_b32 v6, s4, 7
; GCN-NEXT: s_lshr_b32 s4, s0, 24
-; GCN-NEXT: v_writelane_b32 v6, s4, 10
+; GCN-NEXT: v_writelane_b32 v6, s4, 8
; GCN-NEXT: s_lshr_b32 s4, s0, 25
-; GCN-NEXT: v_writelane_b32 v6, s4, 11
+; GCN-NEXT: v_writelane_b32 v6, s4, 9
; GCN-NEXT: s_lshr_b32 s4, s0, 26
-; GCN-NEXT: v_writelane_b32 v6, s4, 12
+; GCN-NEXT: v_writelane_b32 v6, s4, 10
; GCN-NEXT: s_lshr_b32 s4, s0, 27
-; GCN-NEXT: v_writelane_b32 v6, s4, 13
+; GCN-NEXT: v_writelane_b32 v6, s4, 11
; GCN-NEXT: s_lshr_b32 s4, s0, 28
-; GCN-NEXT: v_writelane_b32 v6, s4, 14
+; GCN-NEXT: v_writelane_b32 v6, s4, 12
; GCN-NEXT: s_lshr_b32 s4, s0, 29
-; GCN-NEXT: v_writelane_b32 v6, s4, 15
+; GCN-NEXT: v_writelane_b32 v6, s4, 13
; GCN-NEXT: s_lshr_b32 s4, s0, 30
-; GCN-NEXT: v_writelane_b32 v6, s4, 16
+; GCN-NEXT: v_writelane_b32 v6, s4, 14
; GCN-NEXT: s_lshr_b32 s4, s0, 31
-; GCN-NEXT: v_writelane_b32 v6, s4, 17
-; GCN-NEXT: v_writelane_b32 v6, s9, 18
+; GCN-NEXT: v_writelane_b32 v6, s4, 15
+; GCN-NEXT: s_bfe_u32 s9, s0, 0xf0001
+; GCN-NEXT: v_writelane_b32 v6, s9, 16
; GCN-NEXT: s_bfe_u32 s9, s0, 0xe0002
-; GCN-NEXT: v_writelane_b32 v6, s9, 19
+; GCN-NEXT: v_writelane_b32 v6, s9, 17
; GCN-NEXT: s_bfe_u32 s9, s0, 0xd0003
-; GCN-NEXT: v_writelane_b32 v6, s9, 20
+; GCN-NEXT: v_writelane_b32 v6, s9, 18
; GCN-NEXT: s_bfe_u32 s9, s0, 0xc0004
-; GCN-NEXT: v_writelane_b32 v6, s9, 21
+; GCN-NEXT: v_writelane_b32 v6, s9, 19
; GCN-NEXT: s_bfe_u32 s9, s0, 0xb0005
-; GCN-NEXT: v_writelane_b32 v6, s9, 22
+; GCN-NEXT: v_writelane_b32 v6, s9, 20
; GCN-NEXT: s_bfe_u32 s9, s0, 0xa0006
-; GCN-NEXT: v_writelane_b32 v6, s9, 23
+; GCN-NEXT: v_writelane_b32 v6, s9, 21
; GCN-NEXT: s_bfe_u32 s9, s0, 0x90007
-; GCN-NEXT: v_writelane_b32 v6, s9, 24
+; GCN-NEXT: v_writelane_b32 v6, s9, 22
; GCN-NEXT: s_bfe_u32 s9, s0, 0x80008
-; GCN-NEXT: v_writelane_b32 v6, s9, 25
+; GCN-NEXT: v_writelane_b32 v6, s9, 23
; GCN-NEXT: s_bfe_u32 s9, s0, 0x70009
-; GCN-NEXT: v_writelane_b32 v6, s9, 26
+; GCN-NEXT: v_writelane_b32 v6, s9, 24
; GCN-NEXT: s_bfe_u32 s9, s0, 0x6000a
-; GCN-NEXT: v_writelane_b32 v6, s9, 27
+; GCN-NEXT: v_writelane_b32 v6, s9, 25
; GCN-NEXT: s_bfe_u32 s9, s0, 0x5000b
-; GCN-NEXT: v_writelane_b32 v6, s9, 28
+; GCN-NEXT: v_writelane_b32 v6, s9, 26
; GCN-NEXT: s_bfe_u32 s9, s0, 0x4000c
-; GCN-NEXT: v_writelane_b32 v6, s9, 29
+; GCN-NEXT: v_writelane_b32 v6, s9, 27
; GCN-NEXT: s_bfe_u32 s9, s0, 0x3000d
-; GCN-NEXT: v_writelane_b32 v6, s9, 30
+; GCN-NEXT: v_writelane_b32 v6, s9, 28
; GCN-NEXT: s_bfe_u32 s9, s0, 0x2000e
-; GCN-NEXT: v_writelane_b32 v6, s9, 31
+; GCN-NEXT: v_writelane_b32 v6, s9, 29
; GCN-NEXT: s_bfe_u32 s9, s0, 0x1000f
-; GCN-NEXT: v_writelane_b32 v6, s9, 32
+; GCN-NEXT: v_writelane_b32 v6, s9, 30
; GCN-NEXT: s_bfe_u32 s9, s1, 0xf0001
+; GCN-NEXT: v_writelane_b32 v6, s9, 31
+; GCN-NEXT: s_bfe_u32 s9, s1, 0xe0002
+; GCN-NEXT: v_writelane_b32 v6, s9, 32
+; GCN-NEXT: s_bfe_u32 s9, s1, 0xd0003
+; GCN-NEXT: s_lshr_b32 s42, s1, 16
; GCN-NEXT: s_lshr_b32 s43, s1, 17
; GCN-NEXT: s_lshr_b32 s45, s1, 18
; GCN-NEXT: s_lshr_b32 s47, s1, 19
@@ -3272,14 +3274,14 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_lshr_b32 s74, s2, 17
; GCN-NEXT: s_lshr_b32 s77, s2, 18
; GCN-NEXT: s_lshr_b32 s78, s2, 19
-; GCN-NEXT: s_lshr_b32 s81, s2, 20
-; GCN-NEXT: s_lshr_b32 s82, s2, 21
-; GCN-NEXT: s_lshr_b32 s84, s2, 22
-; GCN-NEXT: s_lshr_b32 s86, s2, 23
-; GCN-NEXT: s_lshr_b32 s89, s2, 24
-; GCN-NEXT: s_lshr_b32 s90, s2, 25
-; GCN-NEXT: s_lshr_b32 s93, s2, 26
-; GCN-NEXT: s_lshr_b32 s94, s2, 27
+; GCN-NEXT: s_lshr_b32 s89, s2, 20
+; GCN-NEXT: s_lshr_b32 s41, s2, 21
+; GCN-NEXT: s_lshr_b32 s40, s2, 22
+; GCN-NEXT: s_lshr_b32 s92, s2, 23
+; GCN-NEXT: s_lshr_b32 s93, s2, 24
+; GCN-NEXT: s_lshr_b32 s94, s2, 25
+; GCN-NEXT: s_lshr_b32 s95, s2, 26
+; GCN-NEXT: s_lshr_b32 vcc_lo, s2, 27
; GCN-NEXT: s_lshr_b32 vcc_hi, s2, 28
; GCN-NEXT: s_lshr_b32 s39, s2, 29
; GCN-NEXT: s_lshr_b32 s38, s2, 30
@@ -3301,8 +3303,6 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_lshr_b32 s5, s3, 30
; GCN-NEXT: s_lshr_b32 s4, s3, 31
; GCN-NEXT: v_writelane_b32 v6, s9, 33
-; GCN-NEXT: s_bfe_u32 s40, s1, 0xe0002
-; GCN-NEXT: s_bfe_u32 s41, s1, 0xd0003
; GCN-NEXT: s_bfe_u32 s44, s1, 0xc0004
; GCN-NEXT: s_bfe_u32 s46, s1, 0xb0005
; GCN-NEXT: s_bfe_u32 s48, s1, 0xa0006
@@ -3322,14 +3322,14 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_bfe_u32 s76, s2, 0xb0005
; GCN-NEXT: s_bfe_u32 s79, s2, 0xa0006
; GCN-NEXT: s_bfe_u32 s80, s2, 0x90007
-; GCN-NEXT: s_bfe_u32 s83, s2, 0x80008
-; GCN-NEXT: s_bfe_u32 s85, s2, 0x70009
-; GCN-NEXT: s_bfe_u32 s87, s2, 0x6000a
-; GCN-NEXT: s_bfe_u32 s88, s2, 0x5000b
-; GCN-NEXT: s_bfe_u32 s91, s2, 0x4000c
-; GCN-NEXT: s_bfe_u32 s92, s2, 0x3000d
-; GCN-NEXT: s_bfe_u32 s95, s2, 0x2000e
-; GCN-NEXT: s_bfe_u32 vcc_lo, s2, 0x1000f
+; GCN-NEXT: s_bfe_u32 s81, s2, 0x80008
+; GCN-NEXT: s_bfe_u32 s82, s2, 0x70009
+; GCN-NEXT: s_bfe_u32 s83, s2, 0x6000a
+; GCN-NEXT: s_bfe_u32 s84, s2, 0x5000b
+; GCN-NEXT: s_bfe_u32 s85, s2, 0x4000c
+; GCN-NEXT: s_bfe_u32 s86, s2, 0x3000d
+; GCN-NEXT: s_bfe_u32 s87, s2, 0x2000e
+; GCN-NEXT: s_bfe_u32 s88, s2, 0x1000f
; GCN-NEXT: s_bfe_u32 s36, s3, 0xf0001
; GCN-NEXT: s_bfe_u32 s35, s3, 0xe0002
; GCN-NEXT: s_bfe_u32 s34, s3, 0xd0003
@@ -3521,498 +3521,500 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 12
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5b
-; GCN-NEXT: s_cselect_b32 s5, s94, 1
+; GCN-NEXT: s_cselect_b32 s5, vcc_lo, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5a
-; GCN-NEXT: s_cselect_b32 s7, s93, 1
+; GCN-NEXT: s_cselect_b32 s7, s95, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x59
-; GCN-NEXT: s_cselect_b32 s7, s90, 1
+; GCN-NEXT: s_cselect_b32 s7, s94, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x58
-; GCN-NEXT: s_cselect_b32 s8, s89, 1
+; GCN-NEXT: s_cselect_b32 s8, s93, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 15
; GCN-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_or_b32 s7, s4, s5
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x57
-; GCN-NEXT: s_cselect_b32 s5, s86, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 3
+; GCN-NEXT: s_cselect_b32 s4, s92, 1
+; GCN-NEXT: s_lshl_b32 s4, s4, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x56
-; GCN-NEXT: s_cselect_b32 s7, s84, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: s_cselect_b32 s5, s40, 1
+; GCN-NEXT: s_and_b32 s5, s5, 1
+; GCN-NEXT: s_lshl_b32 s5, s5, 2
+; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x55
-; GCN-NEXT: s_cselect_b32 s7, s82, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 1
+; GCN-NEXT: s_cselect_b32 s5, s41, 1
+; GCN-NEXT: s_lshl_b32 s5, s5, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x54
-; GCN-NEXT: s_cselect_b32 s8, s81, 1
+; GCN-NEXT: s_cselect_b32 s8, s89, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 3
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_lshl_b32 s5, s5, 4
+; GCN-NEXT: s_or_b32 s5, s8, s5
+; GCN-NEXT: s_and_b32 s5, s5, 3
+; GCN-NEXT: s_or_b32 s8, s5, s4
+; GCN-NEXT: s_lshl_b32 s8, s8, 4
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x53
-; GCN-NEXT: s_cselect_b32 s7, s78, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: s_cselect_b32 s9, s78, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x52
-; GCN-NEXT: s_cselect_b32 s8, s77, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: s_cselect_b32 s10, s77, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x51
-; GCN-NEXT: s_cselect_b32 s8, s74, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: s_cselect_b32 s10, s74, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x50
-; GCN-NEXT: s_cselect_b32 s9, s73, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_cselect_b32 s11, s73, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 15
; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
+; GCN-NEXT: s_and_b32 s8, s8, 0xff
; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 15
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_and_b32 s5, s5, 0xff
-; GCN-NEXT: s_or_b32 s4, s5, s4
-; GCN-NEXT: s_lshl_b32 s4, s4, 16
+; GCN-NEXT: s_lshl_b32 s7, s7, 16
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4f
-; GCN-NEXT: s_cselect_b32 s5, vcc_lo, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 3
+; GCN-NEXT: s_cselect_b32 s8, s88, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4e
-; GCN-NEXT: s_cselect_b32 s7, s95, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: s_cselect_b32 s9, s87, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 2
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4d
-; GCN-NEXT: s_cselect_b32 s7, s92, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 1
+; GCN-NEXT: s_cselect_b32 s9, s86, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4c
-; GCN-NEXT: s_cselect_b32 s8, s91, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 3
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_lshl_b32 s5, s5, 12
+; GCN-NEXT: s_cselect_b32 s10, s85, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 3
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_lshl_b32 s8, s8, 12
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4b
-; GCN-NEXT: s_cselect_b32 s7, s88, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: s_cselect_b32 s9, s84, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4a
-; GCN-NEXT: s_cselect_b32 s8, s87, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: s_cselect_b32 s10, s83, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x49
-; GCN-NEXT: s_cselect_b32 s8, s85, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: s_cselect_b32 s10, s82, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x48
-; GCN-NEXT: s_cselect_b32 s9, s83, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 15
-; GCN-NEXT: s_lshl_b32 s7, s7, 8
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: s_cselect_b32 s11, s81, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 15
+; GCN-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x47
-; GCN-NEXT: s_cselect_b32 s7, s80, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: s_cselect_b32 s9, s80, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x46
-; GCN-NEXT: s_cselect_b32 s8, s79, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: s_cselect_b32 s10, s79, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x45
-; GCN-NEXT: s_cselect_b32 s8, s76, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: s_cselect_b32 s10, s76, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x44
-; GCN-NEXT: s_cselect_b32 s9, s75, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_lshl_b32 s7, s7, 4
+; GCN-NEXT: s_cselect_b32 s11, s75, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_lshl_b32 s9, s9, 4
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x43
-; GCN-NEXT: s_cselect_b32 s8, s72, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 3
+; GCN-NEXT: s_cselect_b32 s10, s72, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x42
-; GCN-NEXT: s_cselect_b32 s9, s71, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_lshl_b32 s9, s9, 2
-; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cselect_b32 s11, s71, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_lshl_b32 s11, s11, 2
+; GCN-NEXT: s_or_b32 s10, s10, s11
; GCN-NEXT: s_cmp_lg_u32 s6, 64
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x41
-; GCN-NEXT: s_cselect_b32 s9, s70, 1
-; GCN-NEXT: s_lshl_b32 s9, s9, 1
-; GCN-NEXT: s_or_b32 s2, s2, s9
+; GCN-NEXT: s_cselect_b32 s11, s70, 1
+; GCN-NEXT: s_lshl_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s2, s2, s11
; GCN-NEXT: s_and_b32 s2, s2, 3
-; GCN-NEXT: s_or_b32 s2, s2, s8
+; GCN-NEXT: s_or_b32 s2, s2, s10
; GCN-NEXT: s_and_b32 s2, s2, 15
-; GCN-NEXT: s_or_b32 s2, s2, s7
+; GCN-NEXT: s_or_b32 s2, s2, s9
; GCN-NEXT: s_and_b32 s2, s2, 0xff
-; GCN-NEXT: s_or_b32 s2, s2, s5
+; GCN-NEXT: s_or_b32 s2, s2, s8
; GCN-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NEXT: s_or_b32 s2, s2, s4
+; GCN-NEXT: s_or_b32 s2, s2, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 63
-; GCN-NEXT: s_cselect_b32 s4, s69, 1
-; GCN-NEXT: s_lshl_b32 s4, s4, 3
+; GCN-NEXT: s_cselect_b32 s7, s69, 1
+; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 62
-; GCN-NEXT: s_cselect_b32 s5, s68, 1
-; GCN-NEXT: s_and_b32 s5, s5, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 2
-; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cselect_b32 s8, s68, 1
+; GCN-NEXT: s_and_b32 s8, s8, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 2
+; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 61
-; GCN-NEXT: s_cselect_b32 s5, s67, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 1
+; GCN-NEXT: s_cselect_b32 s8, s67, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 60
-; GCN-NEXT: s_cselect_b32 s7, s66, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_and_b32 s5, s5, 3
-; GCN-NEXT: s_or_b32 s4, s5, s4
-; GCN-NEXT: s_lshl_b32 s4, s4, 12
+; GCN-NEXT: s_cselect_b32 s9, s66, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_and_b32 s8, s8, 3
+; GCN-NEXT: s_or_b32 s7, s8, s7
+; GCN-NEXT: s_lshl_b32 s7, s7, 12
; GCN-NEXT: s_cmp_lg_u32 s6, 59
-; GCN-NEXT: s_cselect_b32 s5, s63, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 3
+; GCN-NEXT: s_cselect_b32 s8, s63, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 58
-; GCN-NEXT: s_cselect_b32 s7, s61, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: s_cselect_b32 s9, s61, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 2
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 57
-; GCN-NEXT: s_cselect_b32 s7, s59, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 1
+; GCN-NEXT: s_cselect_b32 s9, s59, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 56
-; GCN-NEXT: s_cselect_b32 s8, s58, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 3
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_and_b32 s5, s5, 15
-; GCN-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cselect_b32 s10, s58, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 3
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_and_b32 s8, s8, 15
+; GCN-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 55
-; GCN-NEXT: s_cselect_b32 s5, s55, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 3
+; GCN-NEXT: s_cselect_b32 s8, s55, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 54
-; GCN-NEXT: s_cselect_b32 s7, s53, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: s_cselect_b32 s9, s53, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 2
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 53
-; GCN-NEXT: s_cselect_b32 s7, s51, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 1
+; GCN-NEXT: s_cselect_b32 s9, s51, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 52
-; GCN-NEXT: s_cselect_b32 s8, s50, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 3
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_lshl_b32 s5, s5, 4
+; GCN-NEXT: s_cselect_b32 s10, s50, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 3
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_lshl_b32 s8, s8, 4
; GCN-NEXT: s_cmp_lg_u32 s6, 51
-; GCN-NEXT: s_cselect_b32 s7, s47, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: s_cselect_b32 s9, s47, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 50
-; GCN-NEXT: s_cselect_b32 s8, s45, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: s_cselect_b32 s10, s45, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmp_lg_u32 s6, 49
-; GCN-NEXT: s_cselect_b32 s8, s43, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: s_cselect_b32 s10, s43, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 48
-; GCN-NEXT: s_cselect_b32 s9, s42, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_cselect_b32 s11, s42, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 15
; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
+; GCN-NEXT: s_and_b32 s8, s8, 0xff
; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 15
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_and_b32 s5, s5, 0xff
-; GCN-NEXT: s_or_b32 s4, s5, s4
-; GCN-NEXT: s_lshl_b32 s4, s4, 16
+; GCN-NEXT: s_lshl_b32 s7, s7, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 47
-; GCN-NEXT: s_cselect_b32 s5, s65, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 3
+; GCN-NEXT: s_cselect_b32 s8, s65, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 46
-; GCN-NEXT: s_cselect_b32 s7, s64, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: s_cselect_b32 s9, s64, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 2
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 45
-; GCN-NEXT: s_cselect_b32 s7, s62, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 1
+; GCN-NEXT: s_cselect_b32 s9, s62, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 44
-; GCN-NEXT: s_cselect_b32 s8, s60, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 3
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_lshl_b32 s5, s5, 12
+; GCN-NEXT: s_cselect_b32 s10, s60, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 3
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_lshl_b32 s8, s8, 12
; GCN-NEXT: s_cmp_lg_u32 s6, 43
-; GCN-NEXT: s_cselect_b32 s7, s57, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: s_cselect_b32 s9, s57, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 42
-; GCN-NEXT: s_cselect_b32 s8, s56, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: s_cselect_b32 s10, s56, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmp_lg_u32 s6, 41
-; GCN-NEXT: s_cselect_b32 s8, s54, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: s_cselect_b32 s10, s54, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 40
-; GCN-NEXT: s_cselect_b32 s9, s52, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 15
-; GCN-NEXT: s_lshl_b32 s7, s7, 8
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: s_cselect_b32 s11, s52, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 15
+; GCN-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 39
-; GCN-NEXT: s_cselect_b32 s7, s49, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: s_cselect_b32 s9, s49, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 38
-; GCN-NEXT: s_cselect_b32 s8, s48, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: s_cselect_b32 s10, s48, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmp_lg_u32 s6, 37
-; GCN-NEXT: s_cselect_b32 s8, s46, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: s_cselect_b32 s10, s46, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 36
-; GCN-NEXT: s_cselect_b32 s9, s44, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_lshl_b32 s7, s7, 4
+; GCN-NEXT: s_cselect_b32 s11, s44, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_lshl_b32 s9, s9, 4
; GCN-NEXT: s_cmp_lg_u32 s6, 35
-; GCN-NEXT: s_cselect_b32 s8, s41, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 3
+; GCN-NEXT: v_readlane_b32 s10, v6, 33
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 34
-; GCN-NEXT: s_cselect_b32 s9, s40, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_lshl_b32 s9, s9, 2
-; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: v_readlane_b32 s11, v6, 32
+; GCN-NEXT: s_cselect_b32 s11, s11, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_lshl_b32 s11, s11, 2
+; GCN-NEXT: s_or_b32 s10, s10, s11
; GCN-NEXT: s_cmp_lg_u32 s6, 32
; GCN-NEXT: s_cselect_b32 s1, s1, 1
; GCN-NEXT: s_and_b32 s1, s1, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 33
-; GCN-NEXT: v_readlane_b32 s9, v6, 33
-; GCN-NEXT: s_cselect_b32 s9, s9, 1
-; GCN-NEXT: s_lshl_b32 s9, s9, 1
-; GCN-NEXT: s_or_b32 s1, s1, s9
+; GCN-NEXT: v_readlane_b32 s11, v6, 31
+; GCN-NEXT: s_cselect_b32 s11, s11, 1
+; GCN-NEXT: s_lshl_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s1, s1, s11
; GCN-NEXT: s_and_b32 s1, s1, 3
-; GCN-NEXT: s_or_b32 s1, s1, s8
+; GCN-NEXT: s_or_b32 s1, s1, s10
; GCN-NEXT: s_and_b32 s1, s1, 15
-; GCN-NEXT: s_or_b32 s1, s1, s7
+; GCN-NEXT: s_or_b32 s1, s1, s9
; GCN-NEXT: s_and_b32 s1, s1, 0xff
-; GCN-NEXT: s_or_b32 s1, s1, s5
+; GCN-NEXT: s_or_b32 s1, s1, s8
; GCN-NEXT: s_and_b32 s1, s1, 0xffff
-; GCN-NEXT: s_or_b32 s1, s1, s4
+; GCN-NEXT: s_or_b32 s1, s1, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 31
-; GCN-NEXT: v_readlane_b32 s4, v6, 17
-; GCN-NEXT: s_cselect_b32 s4, s4, 1
-; GCN-NEXT: s_lshl_b32 s4, s4, 3
+; GCN-NEXT: v_readlane_b32 s7, v6, 15
+; GCN-NEXT: s_cselect_b32 s7, s7, 1
+; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 30
-; GCN-NEXT: v_readlane_b32 s5, v6, 16
-; GCN-NEXT: s_cselect_b32 s5, s5, 1
-; GCN-NEXT: s_and_b32 s5, s5, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 2
-; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: v_readlane_b32 s8, v6, 14
+; GCN-NEXT: s_cselect_b32 s8, s8, 1
+; GCN-NEXT: s_and_b32 s8, s8, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 2
+; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 29
-; GCN-NEXT: v_readlane_b32 s5, v6, 15
-; GCN-NEXT: s_cselect_b32 s5, s5, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 1
+; GCN-NEXT: v_readlane_b32 s8, v6, 13
+; GCN-NEXT: s_cselect_b32 s8, s8, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 28
-; GCN-NEXT: v_readlane_b32 s7, v6, 14
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_and_b32 s5, s5, 3
-; GCN-NEXT: s_or_b32 s4, s5, s4
-; GCN-NEXT: s_lshl_b32 s4, s4, 12
+; GCN-NEXT: v_readlane_b32 s9, v6, 12
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_and_b32 s8, s8, 3
+; GCN-NEXT: s_or_b32 s7, s8, s7
+; GCN-NEXT: s_lshl_b32 s7, s7, 12
; GCN-NEXT: s_cmp_lg_u32 s6, 27
-; GCN-NEXT: v_readlane_b32 s5, v6, 13
-; GCN-NEXT: s_cselect_b32 s5, s5, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 3
+; GCN-NEXT: v_readlane_b32 s8, v6, 11
+; GCN-NEXT: s_cselect_b32 s8, s8, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 26
-; GCN-NEXT: v_readlane_b32 s7, v6, 12
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: v_readlane_b32 s9, v6, 10
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 2
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 25
-; GCN-NEXT: v_readlane_b32 s7, v6, 11
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 1
+; GCN-NEXT: v_readlane_b32 s9, v6, 9
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 24
-; GCN-NEXT: v_readlane_b32 s8, v6, 10
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 3
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_and_b32 s5, s5, 15
-; GCN-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: v_readlane_b32 s10, v6, 8
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 3
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_and_b32 s8, s8, 15
+; GCN-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 23
-; GCN-NEXT: v_readlane_b32 s5, v6, 9
-; GCN-NEXT: s_cselect_b32 s5, s5, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 3
+; GCN-NEXT: v_readlane_b32 s8, v6, 7
+; GCN-NEXT: s_cselect_b32 s8, s8, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 22
-; GCN-NEXT: v_readlane_b32 s7, v6, 8
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: v_readlane_b32 s9, v6, 6
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 2
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 21
-; GCN-NEXT: v_readlane_b32 s7, v6, 7
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 1
+; GCN-NEXT: v_readlane_b32 s9, v6, 5
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 20
-; GCN-NEXT: v_readlane_b32 s8, v6, 6
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 3
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_lshl_b32 s5, s5, 4
+; GCN-NEXT: v_readlane_b32 s10, v6, 4
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 3
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_lshl_b32 s8, s8, 4
; GCN-NEXT: s_cmp_lg_u32 s6, 19
-; GCN-NEXT: v_readlane_b32 s7, v6, 5
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: v_readlane_b32 s9, v6, 3
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 18
-; GCN-NEXT: v_readlane_b32 s8, v6, 4
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: v_readlane_b32 s10, v6, 2
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmp_lg_u32 s6, 17
-; GCN-NEXT: v_readlane_b32 s8, v6, 3
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: v_readlane_b32 s10, v6, 1
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 16
-; GCN-NEXT: v_readlane_b32 s9, v6, 2
-; GCN-NEXT: s_cselect_b32 s9, s9, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: v_readlane_b32 s11, v6, 0
+; GCN-NEXT: s_cselect_b32 s11, s11, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 15
; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
+; GCN-NEXT: s_and_b32 s8, s8, 0xff
; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 15
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_and_b32 s5, s5, 0xff
-; GCN-NEXT: s_or_b32 s4, s5, s4
-; GCN-NEXT: s_lshl_b32 s4, s4, 16
+; GCN-NEXT: s_lshl_b32 s7, s7, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 15
-; GCN-NEXT: v_readlane_b32 s5, v6, 32
-; GCN-NEXT: s_cselect_b32 s5, s5, 1
-; GCN-NEXT: s_lshl_b32 s5, s5, 3
+; GCN-NEXT: v_readlane_b32 s8, v6, 30
+; GCN-NEXT: s_cselect_b32 s8, s8, 1
+; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 14
-; GCN-NEXT: v_readlane_b32 s7, v6, 31
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_and_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: v_readlane_b32 s9, v6, 29
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_and_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 2
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 13
-; GCN-NEXT: v_readlane_b32 s7, v6, 30
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 1
+; GCN-NEXT: v_readlane_b32 s9, v6, 28
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 12
-; GCN-NEXT: v_readlane_b32 s8, v6, 29
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 3
-; GCN-NEXT: s_or_b32 s5, s7, s5
-; GCN-NEXT: s_lshl_b32 s5, s5, 12
+; GCN-NEXT: v_readlane_b32 s10, v6, 27
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 3
+; GCN-NEXT: s_or_b32 s8, s9, s8
+; GCN-NEXT: s_lshl_b32 s8, s8, 12
; GCN-NEXT: s_cmp_lg_u32 s6, 11
-; GCN-NEXT: v_readlane_b32 s7, v6, 28
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: v_readlane_b32 s9, v6, 26
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 10
-; GCN-NEXT: v_readlane_b32 s8, v6, 27
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: v_readlane_b32 s10, v6, 25
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmp_lg_u32 s6, 9
-; GCN-NEXT: v_readlane_b32 s8, v6, 26
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: v_readlane_b32 s10, v6, 24
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 8
-; GCN-NEXT: v_readlane_b32 s9, v6, 25
-; GCN-NEXT: s_cselect_b32 s9, s9, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_and_b32 s7, s7, 15
-; GCN-NEXT: s_lshl_b32 s7, s7, 8
-; GCN-NEXT: s_or_b32 s5, s5, s7
+; GCN-NEXT: v_readlane_b32 s11, v6, 23
+; GCN-NEXT: s_cselect_b32 s11, s11, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_and_b32 s9, s9, 15
+; GCN-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 7
-; GCN-NEXT: v_readlane_b32 s7, v6, 24
-; GCN-NEXT: s_cselect_b32 s7, s7, 1
-; GCN-NEXT: s_lshl_b32 s7, s7, 3
+; GCN-NEXT: v_readlane_b32 s9, v6, 22
+; GCN-NEXT: s_cselect_b32 s9, s9, 1
+; GCN-NEXT: s_lshl_b32 s9, s9, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 6
-; GCN-NEXT: v_readlane_b32 s8, v6, 23
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_and_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 2
-; GCN-NEXT: s_or_b32 s7, s7, s8
+; GCN-NEXT: v_readlane_b32 s10, v6, 21
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_and_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 2
+; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_cmp_lg_u32 s6, 5
-; GCN-NEXT: v_readlane_b32 s8, v6, 22
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 1
+; GCN-NEXT: v_readlane_b32 s10, v6, 20
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 4
-; GCN-NEXT: v_readlane_b32 s9, v6, 21
-; GCN-NEXT: s_cselect_b32 s9, s9, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_or_b32 s8, s9, s8
-; GCN-NEXT: s_and_b32 s8, s8, 3
-; GCN-NEXT: s_or_b32 s7, s8, s7
-; GCN-NEXT: s_lshl_b32 s7, s7, 4
+; GCN-NEXT: v_readlane_b32 s11, v6, 19
+; GCN-NEXT: s_cselect_b32 s11, s11, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_or_b32 s10, s11, s10
+; GCN-NEXT: s_and_b32 s10, s10, 3
+; GCN-NEXT: s_or_b32 s9, s10, s9
+; GCN-NEXT: s_lshl_b32 s9, s9, 4
; GCN-NEXT: s_cmp_lg_u32 s6, 3
-; GCN-NEXT: v_readlane_b32 s8, v6, 20
-; GCN-NEXT: s_cselect_b32 s8, s8, 1
-; GCN-NEXT: s_lshl_b32 s8, s8, 3
+; GCN-NEXT: v_readlane_b32 s10, v6, 18
+; GCN-NEXT: s_cselect_b32 s10, s10, 1
+; GCN-NEXT: s_lshl_b32 s10, s10, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 2
-; GCN-NEXT: v_readlane_b32 s9, v6, 19
-; GCN-NEXT: s_cselect_b32 s9, s9, 1
-; GCN-NEXT: s_and_b32 s9, s9, 1
-; GCN-NEXT: s_lshl_b32 s9, s9, 2
-; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: v_readlane_b32 s11, v6, 17
+; GCN-NEXT: s_cselect_b32 s11, s11, 1
+; GCN-NEXT: s_and_b32 s11, s11, 1
+; GCN-NEXT: s_lshl_b32 s11, s11, 2
+; GCN-NEXT: s_or_b32 s10, s10, s11
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s0, s0, 1
; GCN-NEXT: s_and_b32 s0, s0, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 1
-; GCN-NEXT: v_readlane_b32 s6, v6, 18
+; GCN-NEXT: v_readlane_b32 s6, v6, 16
; GCN-NEXT: s_cselect_b32 s6, s6, 1
; GCN-NEXT: s_lshl_b32 s6, s6, 1
; GCN-NEXT: s_or_b32 s0, s0, s6
; GCN-NEXT: s_and_b32 s0, s0, 3
-; GCN-NEXT: s_or_b32 s0, s0, s8
+; GCN-NEXT: s_or_b32 s0, s0, s10
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[90:91], 0x24
; GCN-NEXT: s_and_b32 s0, s0, 15
-; GCN-NEXT: s_or_b32 s0, s0, s7
+; GCN-NEXT: s_or_b32 s0, s0, s9
; GCN-NEXT: s_and_b32 s0, s0, 0xff
-; GCN-NEXT: s_or_b32 s0, s0, s5
+; GCN-NEXT: s_or_b32 s0, s0, s8
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
-; GCN-NEXT: s_or_b32 s0, s0, s4
+; GCN-NEXT: s_or_b32 s0, s0, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_readlane_b32 s0, v6, 0
-; GCN-NEXT: v_readlane_b32 s1, v6, 1
-; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index fe0892788ca84..9d734421fd542 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -299,8 +299,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
@@ -308,6 +306,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index 9644c941cd06c..d1179ddb304ba 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -12,9 +12,9 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG: ; %bb.0: ; %bb
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: s_clause 0x1
-; SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
-; SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: global_load_b128 v[2:5], v[0:1], off
+; SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
; SDAG-NEXT: s_mov_b32 s12, 0
; SDAG-NEXT: s_mov_b32 s3, exec_lo
; SDAG-NEXT: s_mov_b32 s13, s12
@@ -22,31 +22,31 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG-NEXT: s_mov_b32 s15, s12
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; SDAG-NEXT: v_readfirstlane_b32 s8, v4
-; SDAG-NEXT: v_readfirstlane_b32 s9, v5
-; SDAG-NEXT: v_readfirstlane_b32 s10, v6
-; SDAG-NEXT: v_readfirstlane_b32 s11, v7
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s1, s[8:9], v[4:5]
+; SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s1, s[8:9], v[6:7]
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; SDAG-NEXT: v_cmp_eq_u64_e64 s2, s[10:11], v[6:7]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s2, s[10:11], v[8:9]
; SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
; SDAG-NEXT: s_and_b32 s0, s0, s1
; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: s_and_b32 s0, s0, s2
; SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; SDAG-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; SDAG-NEXT: image_sample_c_lz v0, [v1, v1, v1, v1], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; SDAG-NEXT: s_cbranch_execnz .LBB0_1
; SDAG-NEXT: ; %bb.2:
; SDAG-NEXT: s_mov_b32 exec_lo, s3
-; SDAG-NEXT: v_dual_mov_b32 v0, 0x7fc00000 :: v_dual_mov_b32 v1, 1.0
+; SDAG-NEXT: v_dual_mov_b32 v2, 0x7fc00000 :: v_dual_mov_b32 v3, 1.0
; SDAG-NEXT: s_mov_b32 s0, s12
; SDAG-NEXT: s_mov_b32 s1, s12
; SDAG-NEXT: s_mov_b32 s2, s12
@@ -56,18 +56,19 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG-NEXT: s_mov_b32 s6, s12
; SDAG-NEXT: s_mov_b32 s7, s12
; SDAG-NEXT: s_clause 0x2
-; SDAG-NEXT: image_sample_c_lz v0, [v8, v8, v0, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT: image_sample_c_lz v2, [v8, v8, v8, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT: image_sample_c_lz v1, [v8, v1, v8, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: image_sample_c_lz v2, [v1, v1, v2, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: image_sample_c_lz v4, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: image_sample_c_lz v3, [v1, v3, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; SDAG-NEXT: s_waitcnt vmcnt(2)
-; SDAG-NEXT: v_dual_add_f32 v0, v9, v0 :: v_dual_mov_b32 v9, v8
+; SDAG-NEXT: v_add_f32_e32 v0, v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v2, v1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-NEXT: v_add_f32_e32 v0, v1, v0
-; SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, 0
+; SDAG-NEXT: v_add_f32_e32 v0, v4, v0
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-NEXT: v_dual_mul_f32 v7, 0x3e800000, v0 :: v_dual_mov_b32 v0, 0
-; SDAG-NEXT: image_store v[7:9], [v0, v0], s[0:7] dim:SQ_RSRC_IMG_2D unorm
+; SDAG-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0
+; SDAG-NEXT: image_store v[0:2], [v3, v3], s[0:7] dim:SQ_RSRC_IMG_2D unorm
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: issue92561:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 0161cdf03deac..efa46f883aa13 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -5734,14 +5734,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; NOLIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; NOLIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; NOLIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; NOLIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; NOLIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; NOLIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; NOLIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(0)
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
@@ -5829,14 +5829,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; LIT-SRCC-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; LIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; LIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; LIT-SRCC-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; LIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; LIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; LIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; LIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; LIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; LIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; LIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; LIT-SRCC-NEXT: s_waitcnt vmcnt(0)
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
@@ -5927,14 +5927,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v34, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX90A-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX90A-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90A-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX90A-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX90A-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX90A-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX90A-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX90A-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX90A-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX90A-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX90A-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
@@ -5957,14 +5957,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
; GFX942-NEXT: v_mov_b32_e32 v34, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX942-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX942-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX942-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX942-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
@@ -5987,14 +5987,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX942-VGPR-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX942-VGPR-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX942-VGPR-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX942-VGPR-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX942-VGPR-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX942-VGPR-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0)
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 15
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 443c74456b4ff..3162c390be058 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -300,10 +300,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
@@ -362,10 +362,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
@@ -1049,10 +1049,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
@@ -1080,10 +1080,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; GCN-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GCN-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GCN-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
@@ -1349,8 +1349,8 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
@@ -1405,8 +1405,8 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
@@ -1640,13 +1640,13 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1708,13 +1708,13 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -2208,8 +2208,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
@@ -2264,8 +2264,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
@@ -2499,8 +2499,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
@@ -2555,8 +2555,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
@@ -2790,8 +2790,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
@@ -2846,8 +2846,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
@@ -3081,8 +3081,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
@@ -3137,8 +3137,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, 0
@@ -3372,13 +3372,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3440,13 +3440,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -3940,13 +3940,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -4008,13 +4008,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -4508,13 +4508,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -4576,13 +4576,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -5076,13 +5076,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -5144,13 +5144,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 84804eeaa9a75..87c30cfb56db1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -766,28 +766,28 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_clause 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:4
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:8
; GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:12
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:28
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:24
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:20
; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1] offset:16
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:8
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:4
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:20
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:24
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:28
; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -802,28 +802,28 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, 0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_clause 0x7
+; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:4
+; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:8
; GFX12-FAKE16-NEXT: global_load_u16 v3, v8, s[0:1] offset:12
-; GFX12-FAKE16-NEXT: global_load_u16 v7, v8, s[0:1] offset:28
-; GFX12-FAKE16-NEXT: global_load_u16 v6, v8, s[0:1] offset:24
-; GFX12-FAKE16-NEXT: global_load_u16 v5, v8, s[0:1] offset:20
; GFX12-FAKE16-NEXT: global_load_u16 v4, v8, s[0:1] offset:16
-; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:8
-; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:4
+; GFX12-FAKE16-NEXT: global_load_u16 v5, v8, s[0:1] offset:20
+; GFX12-FAKE16-NEXT: global_load_u16 v6, v8, s[0:1] offset:24
+; GFX12-FAKE16-NEXT: global_load_u16 v7, v8, s[0:1] offset:28
; GFX12-FAKE16-NEXT: global_load_u16 v0, v8, s[0:1]
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 388006281abdc..dfcaf99925cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4666,107 +4666,106 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:112
; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7)
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14)
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v10, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v42, 16, v9
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v40, 16, v8
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v41, v9, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v8, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v23
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v22
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v23, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v22, 0, 16
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v4, off, s[88:91], 0 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v21
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v20
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v21, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v20, 0, 16
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12)
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v50, 16, v1
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v48, 16, v0
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v49, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v0, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v3
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v2
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v1
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v0
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v0, 0, 16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8)
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v36
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v35
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v35, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v46, 16, v5
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v44, 16, v4
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v45, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v54, 16, v26
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v52, 16, v25
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v53, v26, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v25, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v58, 16, v24
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v56, 16, v23
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v57, v24, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v23, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v30
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v29
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v30, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v29, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v62, 16, v28
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v60, 16, v27
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v61, v28, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v27, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v34
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v33
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v34, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v33, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v32
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v31
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v32, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v31, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v38
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v37
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v38, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v37, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v37
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v36
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v37, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v36, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 16, v19
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 16, v18
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v19, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v17
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v17, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v16, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v11
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v10
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v11, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v10, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v9
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v8
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v9, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v8, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v27
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v26
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v27, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v26, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v25
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v24
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v25, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v24, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v31
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v30
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v31, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v30, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v63, 16, v29
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 16, v28
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v62, v29, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v28, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v35
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v34
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v35, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v34, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v33
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v32
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v33, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v32, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v39
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v38
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v39, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v38, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:160
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 0499b007575c8..f14fe2518d97a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -2377,8 +2377,8 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: s_mov_b32 s9, s7
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
@@ -2397,24 +2397,23 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v5
; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9
-; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10
-; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
-; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v12
-; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v13
-; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v14
-; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v15
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v15
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v14
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v13
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v12
+; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v12
+; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v13
+; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v14
+; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v15
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v10
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v9
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v9
+; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v10
+; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v11
; SI-NOHSA-NEXT: s_mov_b32 s0, s4
; SI-NOHSA-NEXT: s_mov_b32 s1, s5
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
@@ -2424,7 +2423,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
@@ -2686,51 +2685,51 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
-; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, v6
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v2
+; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v6
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, v5
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, v7
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, v2
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, v3
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v2
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v3
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, v9
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
+; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v34, v13
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v14
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, v15
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v10
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v34, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, v11
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:96
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:112
; GCN-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:80
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:32
; GCN-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
; GCN-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1]
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:16
; GCN-HSA-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(1) %in
%ext = sext <16 x i32> %ld to <16 x i64>
@@ -3075,17 +3074,17 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s6
; SI-NOHSA-NEXT: s_mov_b32 s9, s7
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v31
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v30
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v27
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v26
; SI-NOHSA-NEXT: s_waitcnt vmcnt(6)
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14
@@ -3095,17 +3094,17 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v42, v13
; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v14
; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v15
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28
-; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v28
-; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29
-; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30
-; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v25
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v24
+; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v24
+; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v25
+; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v26
+; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v27
; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(9)
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(8)
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
@@ -3115,7 +3114,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v5
; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v6
; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v7
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(8)
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1
@@ -3124,42 +3122,40 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v50, v1
; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16
-; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v16
-; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v17
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19
; SI-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20
-; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v20
-; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v21
-; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v22
-; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v23
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v27
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v26
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v25
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v24
-; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v24
-; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v25
-; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v26
-; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v27
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v23
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v22
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v21
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v20
+; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v20
+; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v21
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v22
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v23
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v19
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v18
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v17
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v16
+; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v16
+; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v17
+; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v18
+; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v19
; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9
-; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v10
-; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v11
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v31
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v30
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v29
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v28
+; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v28
+; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v29
+; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v30
+; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v31
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v11
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v10
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v9
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v9
+; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v10
+; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v11
; SI-NOHSA-NEXT: s_mov_b32 s0, s4
; SI-NOHSA-NEXT: s_mov_b32 s1, s5
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
@@ -3178,11 +3174,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
@@ -3391,10 +3387,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:80
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
@@ -3402,52 +3398,52 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v11
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v10
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v3
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v2
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v12
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v13
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v14
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v15
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v8
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v9
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v10
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v11
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v7
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v6
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v4
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v5
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v11
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v10
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v9
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v8
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v8
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v9
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v10
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v11
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v6
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v7
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v2
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v3
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v15
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v14
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v14
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v15
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v19
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v19
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v19
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v0
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v1
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v13
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v12
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v12
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v13
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v18
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v52, v16
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v54, v17
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v18
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v18
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20
@@ -3458,36 +3454,36 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v26
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v24
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:224
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:240
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v46, 31, v25
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v44, 31, v24
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v31
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v30
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v31
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v29
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v28
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v28
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v29
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v31
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v30
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v31
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v29
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v28
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v28
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v29
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:208
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v30
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v24
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v25
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v30
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v43, v24
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v45, v25
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v26
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v27
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
;
@@ -3656,233 +3652,109 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; EG-NEXT: MOV * T32.Z, T12.Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
-; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
-; GCN-GFX900-HSA: ; %bb.0:
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1]
-; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v12, 0
-; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17
-; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0
-; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:80
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v6
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v7
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v37, v8
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v39, v9
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v10
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v11
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v5
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v29, off, s[20:23], 0 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v30, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v31, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v32, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v16
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v15
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v45, v13
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v47, v14
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v15
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v16
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v49, v17
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v51, v18
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v19
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v20
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v53, v21
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v55, v22
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v17, v23
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v19, v24
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3]
-; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v36, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v41, v25
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v23
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v24
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v22
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:144
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[25:28], s[0:1] offset:32
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:48
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1]
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
-; GCN-GFX900-HSA-NEXT: s_endpgm
-;
-; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64:
-; GCN-GFX908-HSA: ; %bb.0:
-; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v12, 0
-; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:80
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v37, v8
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v39, v9
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v10
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v11
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v10, v5
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v6
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v7
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v16
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v15
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v45, v13
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v47, v14
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v15
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v16
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v49, v17
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v51, v18
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v13, v19
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v15, v20
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v53, v21
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v55, v22
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v23
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v24
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3]
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v29
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v32
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v30
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v31
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v36, a3
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v41, v25
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v43, v26
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v27
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v0
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v0
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v1
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v57, v2
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v59, v3
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a2
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a1
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a0
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, v23
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v2, v24
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v22
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, v21
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:144
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[25:28], s[0:1] offset:32
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:48
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1]
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
-; GCN-GFX908-HSA-NEXT: s_endpgm
+; GCN-HSA-LABEL: global_sextload_v32i32_to_v32i64:
+; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0
+; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96
+; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112
+; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:80
+; GCN-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64
+; GCN-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48
+; GCN-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32
+; GCN-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:16
+; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
+; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v37, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v39, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v33, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v35, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, v7
+; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v41, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v43, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, v2
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v3
+; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v15
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13
+; GCN-HSA-NEXT: v_mov_b32_e32 v45, v13
+; GCN-HSA-NEXT: v_mov_b32_e32 v47, v14
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, v15
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, v16
+; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17
+; GCN-HSA-NEXT: v_mov_b32_e32 v49, v17
+; GCN-HSA-NEXT: v_mov_b32_e32 v51, v18
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, v19
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, v20
+; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
+; GCN-HSA-NEXT: v_mov_b32_e32 v53, v21
+; GCN-HSA-NEXT: v_mov_b32_e32 v55, v22
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, v23
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, v24
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v28
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v26
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v25
+; GCN-HSA-NEXT: v_mov_b32_e32 v57, v25
+; GCN-HSA-NEXT: v_mov_b32_e32 v59, v26
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, v27
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, v28
+; GCN-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3]
+; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
+; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v28
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v27
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v26
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v25
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v25
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v26
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:208
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:176
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:144
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:32
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[21:24], s[0:1] offset:48
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, v27
+; GCN-HSA-NEXT: v_mov_b32_e32 v34, v28
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[32:35], s[0:1] offset:16
+; GCN-HSA-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(1) %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -3902,31 +3774,31 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s6
; SI-NOHSA-NEXT: s_mov_b32 s9, s7
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
; SI-NOHSA-NEXT: s_mov_b32 s0, s4
; SI-NOHSA-NEXT: s_mov_b32 s1, s5
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0)
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9
+; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0)
; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v32
@@ -4485,22 +4357,22 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:112
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:96
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:112
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:64
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:80
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
@@ -4604,27 +4476,29 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
-; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
-; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, s7
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:80
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:96
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:112
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:64
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:80
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
@@ -4723,3 +4597,6 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
}
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-GFX900-HSA: {{.*}}
+; GCN-GFX908-HSA: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index 04d906ca6ad9c..558456125751a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -4845,129 +4845,139 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32:
; VI-DS128: ; %bb.0:
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-DS128-NEXT: s_mov_b32 m0, -1
+; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-DS128-NEXT: s_mov_b32 s90, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
-; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
-; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
+; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT: ds_read_b128 v[8:11], v0
+; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
-; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16
-; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
-; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
+; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v10
+; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
+; VI-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v1, v10, 0, 16
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v9
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v8
+; VI-DS128-NEXT: v_bfe_i32 v6, v9, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v4, v8, 0, 16
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v19
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v18
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16
-; VI-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v3, v19, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v1, v18, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26
-; VI-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v23
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v22
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v21
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v20
+; VI-DS128-NEXT: v_bfe_i32 v18, v23, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v16, v22, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v30, v21, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v28, v20, 0, 16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36
-; VI-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16
-; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
-; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
-; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v27
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v26
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v35, 16, v25
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v24
+; VI-DS128-NEXT: v_bfe_i32 v22, v27, 0, 16
+; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
+; VI-DS128-NEXT: v_bfe_i32 v20, v26, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v34, v25, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v32, v24, 0, 16
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
+; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37
-; VI-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v39
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v38
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
-; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
-; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
-; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
-; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v27
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v26
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v25
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v24
+; VI-DS128-NEXT: v_bfe_i32 v49, v27, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v47, v26, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v53, v25, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v51, v24, 0, 16
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v37
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v36
+; VI-DS128-NEXT: v_bfe_i32 v41, v39, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v39, v38, 0, 16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
-; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
-; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36
-; VI-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41
-; VI-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56
-; VI-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39
-; VI-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16
-; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224
-; VI-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240
-; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192
-; VI-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208
-; VI-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160
-; VI-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176
-; VI-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128
-; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144
-; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96
-; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112
-; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64
-; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80
-; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32
-; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v25
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v24
+; VI-DS128-NEXT: v_bfe_i32 v2, v25, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v0, v24, 0, 16
+; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT: v_bfe_i32 v45, v37, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v43, v36, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v61, 16, v58
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v59, 16, v57
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v56
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v55
+; VI-DS128-NEXT: v_bfe_i32 v60, v58, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v58, v57, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v10, v56, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v8, v55, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v27
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v26
+; VI-DS128-NEXT: v_bfe_i32 v6, v27, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v4, v26, 0, 16
+; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224
+; VI-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240
+; VI-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192
+; VI-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208
+; VI-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160
+; VI-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176
+; VI-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128
+; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144
+; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96
+; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112
+; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64
+; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80
+; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32
+; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload
+; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload
+; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload
+; VI-DS128-NEXT: s_waitcnt vmcnt(0)
+; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
-; VI-DS128-NEXT: ds_write_b128 v32, v[0:3]
+; VI-DS128-NEXT: ds_write_b128 v24, v[0:3]
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
-; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16
+; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32:
@@ -4978,125 +4988,136 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: s_mov_b32 s14, -1
; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
-; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
-; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
+; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
+; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
-; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16
-; GFX9-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v10
+; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v1, v10, 0, 16
+; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v9
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v8
+; GFX9-DS128-NEXT: v_bfe_i32 v6, v9, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v4, v8, 0, 16
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
-; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
+; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v19
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v18
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16
-; GFX9-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v3, v19, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v1, v18, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26
-; GFX9-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v23
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v22
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v21
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v20
+; GFX9-DS128-NEXT: v_bfe_i32 v18, v23, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v16, v22, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v30, v21, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v28, v20, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36
-; GFX9-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16
-; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
-; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
-; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v27
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v26
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v35, 16, v25
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v24
+; GFX9-DS128-NEXT: v_bfe_i32 v22, v27, 0, 16
+; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
+; GFX9-DS128-NEXT: v_bfe_i32 v20, v26, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v34, v25, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v32, v24, 0, 16
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
+; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
+; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37
-; GFX9-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v39
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v38
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
-; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
-; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
-; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
-; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v27
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v26
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v25
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v24
+; GFX9-DS128-NEXT: v_bfe_i32 v49, v27, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v47, v26, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v53, v25, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v51, v24, 0, 16
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v37
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v36
+; GFX9-DS128-NEXT: v_bfe_i32 v41, v39, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v39, v38, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
-; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
-; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36
-; GFX9-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41
-; GFX9-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56
-; GFX9-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39
-; GFX9-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v25
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v24
+; GFX9-DS128-NEXT: v_bfe_i32 v2, v25, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v24, 0, 16
+; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
+; GFX9-DS128-NEXT: v_bfe_i32 v45, v37, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v43, v36, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v61, 16, v58
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v59, 16, v57
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v56
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v55
+; GFX9-DS128-NEXT: v_bfe_i32 v60, v58, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v58, v57, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v10, v56, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v8, v55, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v27
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v26
+; GFX9-DS128-NEXT: v_bfe_i32 v6, v27, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v4, v26, 0, 16
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32
+; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3]
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3]
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16
; GFX9-DS128-NEXT: s_endpgm
%load = load <64 x i16>, ptr addrspace(3) %in
%ext = sext <64 x i16> %load to <64 x i32>
@@ -6691,53 +6712,53 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out
; VI-DS128: ; %bb.0:
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
-; VI-DS128-NEXT: v_mov_b32_e32 v26, 0
-; VI-DS128-NEXT: v_mov_b32_e32 v22, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v24, v26
+; VI-DS128-NEXT: v_mov_b32_e32 v25, 0
+; VI-DS128-NEXT: v_mov_b32_e32 v21, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v23, v25
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v5, s1
-; VI-DS128-NEXT: ds_read_b128 v[0:3], v5
-; VI-DS128-NEXT: ds_read_b128 v[13:16], v5 offset:16
-; VI-DS128-NEXT: v_mov_b32_e32 v11, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v19, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v8, v26
+; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
+; VI-DS128-NEXT: ds_read_b128 v[0:3], v4
+; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
+; VI-DS128-NEXT: v_mov_b32_e32 v28, s0
+; VI-DS128-NEXT: v_mov_b32_e32 v18, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v15, v25
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v13
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; VI-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v14
-; VI-DS128-NEXT: v_mov_b32_e32 v14, s0
-; VI-DS128-NEXT: v_mov_b32_e32 v13, v26
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; VI-DS128-NEXT: ds_write_b128 v14, v[21:24] offset:64
-; VI-DS128-NEXT: v_mov_b32_e32 v21, v26
-; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32
-; VI-DS128-NEXT: v_mov_b32_e32 v10, v26
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7
+; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v6
+; VI-DS128-NEXT: ds_write_b128 v28, v[20:23] offset:112
+; VI-DS128-NEXT: v_mov_b32_e32 v20, v25
+; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2
+; VI-DS128-NEXT: ds_write_b128 v28, v[17:20] offset:96
+; VI-DS128-NEXT: v_mov_b32_e32 v17, v25
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v1
+; VI-DS128-NEXT: ds_write_b128 v28, v[14:17] offset:32
+; VI-DS128-NEXT: v_mov_b32_e32 v12, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v14, v25
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v15
-; VI-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; VI-DS128-NEXT: ds_write_b128 v14, v[18:21] offset:112
-; VI-DS128-NEXT: v_mov_b32_e32 v16, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v18, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v1, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v3, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v28, v26
-; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16
-; VI-DS128-NEXT: v_mov_b32_e32 v5, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v7, v26
-; VI-DS128-NEXT: ds_write_b128 v14, v[15:18] offset:96
-; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48
-; VI-DS128-NEXT: ds_write_b128 v14, v[25:28] offset:80
-; VI-DS128-NEXT: ds_write_b128 v14, v[4:7]
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v5
+; VI-DS128-NEXT: v_mov_b32_e32 v5, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v7, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v1, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v3, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v27, v25
+; VI-DS128-NEXT: ds_write_b128 v28, v[11:14] offset:16
+; VI-DS128-NEXT: v_mov_b32_e32 v9, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v11, v25
+; VI-DS128-NEXT: ds_write_b128 v28, v[4:7] offset:64
+; VI-DS128-NEXT: ds_write_b128 v28, v[0:3] offset:48
+; VI-DS128-NEXT: ds_write_b128 v28, v[24:27] offset:80
+; VI-DS128-NEXT: ds_write_b128 v28, v[8:11]
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i64:
@@ -8682,111 +8703,112 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
-; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[9:12], v4 offset:32
-; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
-; VI-DS128-NEXT: ds_read_b128 v[17:20], v4 offset:16
-; VI-DS128-NEXT: ds_read_b128 v[4:7], v4
+; VI-DS128-NEXT: v_mov_b32_e32 v13, s1
+; VI-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:48
+; VI-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:32
+; VI-DS128-NEXT: v_mov_b32_e32 v12, s0
+; VI-DS128-NEXT: ds_read_b128 v[8:11], v13
+; VI-DS128-NEXT: ds_read_b128 v[18:21], v13 offset:16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
-; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224
-; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; VI-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224
+; VI-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:240
-; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; VI-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:208
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:240
+; VI-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:208
; VI-DS128-NEXT: s_waitcnt lgkmcnt(5)
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11
-; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12
-; VI-DS128-NEXT: v_bfe_i32 v0, v12, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160
+; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176
+; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-DS128-NEXT: s_waitcnt lgkmcnt(6)
-; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v19
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144
-; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v20
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96
-; VI-DS128-NEXT: v_bfe_i32 v9, v20, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v17
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:112
-; VI-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64
-; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128
+; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192
+; VI-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(7)
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144
+; VI-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96
+; VI-DS128-NEXT: v_bfe_i32 v13, v21, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112
; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80
-; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64
+; VI-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; VI-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v13, v11, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12]
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
+; VI-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:48
+; VI-DS128-NEXT: ds_write_b128 v12, v[6:9]
+; VI-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:16
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index 5b6af7654f7e9..fdc9b0164a9ce 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -95,53 +95,52 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v2, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:4
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT: ds_read_u8 v5, v0
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v1, v0
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v7
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v7
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:13
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:15
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:13
; GFX7-NEXT: ds_read_u8 v9, v0 offset:14
-; GFX7-NEXT: ds_read_u8 v10, v0 offset:12
-; GFX7-NEXT: ds_read_u8 v11, v0 offset:10
-; GFX7-NEXT: ds_read_u8 v0, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v9
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v10
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -331,27 +330,26 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_u16 v1, v0 offset:2
-; GFX7-NEXT: ds_read_u16 v3, v0 offset:12
-; GFX7-NEXT: ds_read_u16 v2, v0 offset:8
-; GFX7-NEXT: ds_read_u16 v4, v0 offset:4
-; GFX7-NEXT: ds_read_u16 v5, v0
-; GFX7-NEXT: ds_read_u16 v6, v0 offset:6
-; GFX7-NEXT: ds_read_u16 v7, v0 offset:10
+; GFX7-NEXT: ds_read_u16 v1, v0
+; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
+; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
+; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
+; GFX7-NEXT: ds_read_u16 v5, v0 offset:8
+; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
+; GFX7-NEXT: ds_read_u16 v7, v0 offset:12
; GFX7-NEXT: ds_read_u16 v8, v0 offset:14
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align2:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index 509aba49893f6..c5455910b68be 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -86,43 +86,41 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v2, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v1, v0
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
-; GFX7-NEXT: ds_read_u8 v5, v0
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v2, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:8
; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:10
-; GFX7-NEXT: ds_read_u8 v0, v0 offset:8
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
+; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v8
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -274,22 +272,21 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_u16 v1, v0 offset:2
-; GFX7-NEXT: ds_read_u16 v2, v0 offset:8
+; GFX7-NEXT: ds_read_u16 v1, v0
+; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
-; GFX7-NEXT: ds_read_u16 v4, v0
-; GFX7-NEXT: ds_read_u16 v5, v0 offset:6
+; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
+; GFX7-NEXT: ds_read_u16 v5, v0 offset:8
; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
-; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v3i32_align2:
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index a5b64f6f80d9b..2787edb9d4abe 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -145,15 +145,15 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
-; GFX9-NEXT: global_load_dword v4, v0, s[2:3]
-; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3
+; GFX9-NEXT: global_load_dword v4, v0, s[6:7]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v1, v0, s[6:7] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_pk_max_i16 v3, v4, v3
+; GFX9-NEXT: v_pk_max_i16 v3, v3, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v1, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 5b7c36559a366..4d58786002bd7 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -169,23 +169,23 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:92
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
@@ -197,22 +197,22 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
-; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:96
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8
-; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -457,23 +457,23 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:92
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
@@ -485,22 +485,22 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
-; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:96
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8
-; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index c60642e2cc4d8..54f3e1b862db9 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -19,8 +19,8 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64
-; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:32
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:64
; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25]
; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128
@@ -49,10 +49,9 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15]
; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
@@ -3649,54 +3648,54 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:232
; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244
; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:212
-; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:192
-; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:180
-; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:160
-; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156
-; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152
-; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148
-; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:168
+; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:172
+; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
@@ -3713,24 +3712,21 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_waitcnt vmcnt(43)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(40)
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:192
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:176
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:144
-; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:144
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10]
@@ -3806,31 +3802,30 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:31
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:38
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39
; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:47
; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48
; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49
; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50
@@ -3839,49 +3834,45 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53
; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66
; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69
; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:70
; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:73
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_clause 0x35
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: s_clause 0x30
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:174
; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:171
; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:176
; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:177
; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:178
@@ -3894,62 +3885,62 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189
; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:190
; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:187
; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193
; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194
; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:196
; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197
; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:198
; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:199
; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:7
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v9
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 8, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 8, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v21, 8, v19
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v27, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v29, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v26, 8, v25
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
@@ -3969,76 +3960,83 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v17, 8, v38
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v52, 8, v51
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v55
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:84
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: s_waitcnt vmcnt(60)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x5
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:212
; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:213
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v82, 8, v81
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:211
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
@@ -4056,31 +4054,31 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:229
; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230
; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:239
; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:235
; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0xc
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0xb
; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:240
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:241
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
@@ -4090,55 +4088,53 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:246
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:247
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v90, 8, v121
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(59)
+; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v78, v78, 8, v93
+; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(56)
-; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(55)
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(54)
-; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(52)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: s_waitcnt vmcnt(50)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v3
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -4146,33 +4142,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
@@ -4180,7 +4178,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4188,23 +4186,24 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:100
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -4212,33 +4211,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
@@ -4246,7 +4247,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4254,23 +4255,24 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:116
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -4278,33 +4280,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
@@ -4312,7 +4316,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4320,23 +4324,24 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:132
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -4344,33 +4349,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
@@ -4378,7 +4385,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4386,184 +4393,188 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:148
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v86, 8, v96
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v12
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v9, 8, v11
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:159
; ALIGNED-NEXT: v_lshl_or_b32 v5, v14, 8, v18
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v83
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v110
-; ALIGNED-NEXT: v_lshl_or_b32 v79, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v83, 8, v84
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v82
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v120, 8, v123
-; ALIGNED-NEXT: v_lshl_or_b32 v72, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v127
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v70
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v104, 8, v108
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v108, 8, v109
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v70
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v125
; ALIGNED-NEXT: v_lshl_or_b32 v45, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v71
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v53
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v71
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94
-; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v69
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v53, 8, v54
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v105
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v51, 8, v54
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v123
+; ALIGNED-NEXT: v_lshl_or_b32 v118, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v68
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v77
; ALIGNED-NEXT: v_lshl_or_b32 v114, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v49
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v88, 8, v89
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v38, 8, v48
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v62, 8, v63
; ALIGNED-NEXT: v_lshl_or_b32 v98, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62
; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v31
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
; ALIGNED-NEXT: v_lshl_or_b32 v81, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v36
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v34
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
-; ALIGNED-NEXT: v_lshl_or_b32 v68, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v34
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v28, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v27
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
-; ALIGNED-NEXT: v_lshl_or_b32 v50, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v24, 8, v26
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
-; ALIGNED-NEXT: v_lshl_or_b32 v48, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v22, 8, v23
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
+; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21
; ALIGNED-NEXT: v_lshl_or_b32 v4, v16, 8, v15
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v20
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v116, 8, v117
; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v13
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v40
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v13
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v100, 8, v101
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115
-; ALIGNED-NEXT: v_lshl_or_b32 v107, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v115
+; ALIGNED-NEXT: v_lshl_or_b32 v121, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v112
; ALIGNED-NEXT: v_lshl_or_b32 v1, v99, 8, v102
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: v_lshl_or_b32 v94, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v96, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v85, 8, v87
+; ALIGNED-NEXT: v_lshl_or_b32 v91, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:251
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v7, 8, v8
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v10
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v9, 8, v1
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v89, 16, v6
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v1, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v76, 16, v6
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v109, 8, v106
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v122, 8, v111
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
-; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v106, 8, v90
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v127, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v111, 8, v122
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v121, 8, v109
+; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 16, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v78, v95, 8, v92
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v90, 8, v79
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:18
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:16
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:17
-; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 16, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v107, 8, v124
+; ALIGNED-NEXT: v_lshl_or_b32 v78, v106, 8, v110
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 16, v76
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v95, 8, v104
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v78, v92, 8, v93
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 16, v76
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v90
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v78
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v126, v89, 8, v106
+; ALIGNED-NEXT: v_lshl_or_b32 v126, v76, 8, v90
; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4575,14 +4586,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v3, 3
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v4, vcc_lo
-; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:247
-; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248
-; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:246
-; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:252
-; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:250
-; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:251
+; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:247
+; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:248
+; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:246
+; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:252
+; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:250
+; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:251
; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:249
-; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:245
+; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:245
; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:239
; ALIGNED-NEXT: flat_store_byte v[5:6], v16 offset:240
; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:238
@@ -4591,18 +4602,18 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:243
; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:241
; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:237
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:252
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244
; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:231
-; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:232
-; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:230
-; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:236
-; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:234
-; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:235
+; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:231
+; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:232
+; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:230
+; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:236
+; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:234
+; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:235
; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:233
-; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:229
+; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:229
; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:223
; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:224
; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:222
@@ -4613,44 +4624,44 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:221
; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:192
; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:204
-; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:200
; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:196
; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:210
-; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:212
-; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:206
-; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:208
-; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:207
+; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:212
+; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:206
+; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:208
+; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:207
; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:211
; ALIGNED-NEXT: flat_store_byte v[5:6], v80 offset:209
-; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:215
-; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:216
-; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:214
-; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:220
-; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:218
-; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:219
-; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:217
+; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:215
+; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:216
+; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:214
+; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:220
+; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:218
+; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:219
+; ALIGNED-NEXT: flat_store_byte v[5:6], v68 offset:217
; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:213
-; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:205
-; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:220
-; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:205
+; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:208
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:199
-; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:200
-; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:198
-; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:204
-; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:202
-; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:203
+; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:199
+; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:200
+; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:198
+; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:204
+; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:202
+; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:203
; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:201
-; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:197
+; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:197
; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:191
; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:192
-; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:190
+; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:190
; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:196
; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:194
; ALIGNED-NEXT: flat_store_byte v[5:6], v102 offset:195
-; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:193
+; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:193
; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:189
; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
@@ -4666,14 +4677,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:183
-; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:184
-; ALIGNED-NEXT: flat_store_byte v[5:6], v118 offset:182
+; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:183
+; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:184
+; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:182
; ALIGNED-NEXT: flat_store_byte v[5:6], v41 offset:188
; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:186
; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:187
; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:185
-; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:181
+; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:181
; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:175
; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:176
; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:174
@@ -4693,70 +4704,78 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:167
-; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:168
-; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:166
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:167
+; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:168
+; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:166
; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:172
-; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:170
-; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:171
-; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:169
-; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:165
-; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:159
-; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:160
-; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:158
-; ALIGNED-NEXT: flat_store_byte v[5:6], v91 offset:164
-; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:162
-; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:163
-; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:161
-; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:157
+; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:170
+; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:171
+; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:169
+; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:165
+; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:159
+; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:160
+; ALIGNED-NEXT: flat_store_byte v[5:6], v122 offset:158
+; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:164
+; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:162
+; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:163
+; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:161
+; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151
-; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:152
-; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:150
-; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:156
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v127 offset:151
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:152
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:156
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153
-; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:149
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
@@ -5002,13 +5021,13 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
@@ -5017,10 +5036,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
@@ -5213,11 +5232,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21
-; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16
-; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:14
+; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:15
+; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:16
+; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20
@@ -5230,55 +5247,49 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17
-; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:16
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:16
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:7
-; ALIGNED-NEXT: flat_store_byte v[5:6], v121 offset:8
-; ALIGNED-NEXT: flat_store_byte v[5:6], v127 offset:10
-; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:6
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9
-; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:8
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:7
+; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:8
+; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:10
+; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:6
+; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:12
+; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:11
+; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:9
+; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0
; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1
@@ -5411,8 +5422,8 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: v_add_co_u32 v52, vcc_lo, v2, s4
; CHECK-NEXT: v_add_co_ci_u32_e64 v53, null, s5, v3, vcc_lo
; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[52:53] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[52:53]
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[52:53]
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[52:53] offset:16
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v52, 48
; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v53, vcc_lo
; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v52, 0x60
@@ -5444,8 +5455,8 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11]
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7]
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
@@ -5483,8 +5494,8 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64
-; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:32
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:64
; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25]
; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128
@@ -5513,10 +5524,9 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15]
; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
@@ -12550,54 +12560,54 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168
; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180
; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:232
; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:244
; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:156
-; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:152
-; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:148
-; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
@@ -12613,20 +12623,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:192
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:176
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:128
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(11)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10]
@@ -12656,54 +12664,54 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244
-; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
-; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:232
+; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:236
+; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180
; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
-; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156
-; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152
-; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148
-; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
@@ -12719,24 +12727,19 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, -1
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:192
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:176
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144
-; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:144
; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10]
@@ -12820,17 +12823,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:31
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
@@ -12855,6 +12857,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62
@@ -12862,7 +12865,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69
@@ -12870,86 +12872,90 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:73
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_clause 0x31
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: s_clause 0x34
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:6
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v9
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 8, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 8, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
@@ -12957,8 +12963,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25
; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
@@ -12993,7 +12999,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x2
@@ -13002,137 +13007,133 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v82, 8, v81
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:215
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:235
; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0xc
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:252
; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(44)
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(50)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(43)
+; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
+; ALIGNED-NEXT: s_waitcnt vmcnt(48)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
+; ALIGNED-NEXT: s_waitcnt vmcnt(46)
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
+; ALIGNED-NEXT: s_waitcnt vmcnt(45)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(32)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
-; ALIGNED-NEXT: s_clause 0x8
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:223
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
@@ -13140,6 +13141,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:95
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -13150,30 +13153,32 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
@@ -13181,7 +13186,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -13191,7 +13196,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
@@ -13206,6 +13210,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:111
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -13216,40 +13222,42 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
@@ -13257,7 +13265,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
@@ -13272,6 +13279,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:127
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -13282,30 +13291,32 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
@@ -13313,7 +13324,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -13321,503 +13332,499 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:132
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:148
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v98, 8, v100
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v13
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v15, 8, v20
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v9, 8, v7
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v110
-; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v96
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v86
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v123
-; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v70, 8, v84
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v11
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v84, 8, v85
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v14, 8, v18
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v95
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v82
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v109
-; ALIGNED-NEXT: v_lshl_or_b32 v57, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v83
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v82
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v80
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT: v_lshl_or_b32 v47, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v81
; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v65
+; ALIGNED-NEXT: v_lshl_or_b32 v40, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v70
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v67
+; ALIGNED-NEXT: v_lshl_or_b32 v116, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v49, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v50
+; ALIGNED-NEXT: v_lshl_or_b32 v100, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v83, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v36
+; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v31
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v27, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v24, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v16, 8, v15
+; ALIGNED-NEXT: v_lshl_or_b32 v26, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v20
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v22, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5
+; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v123, 8, v125
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v111, 8, v121
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v124
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v109
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v106
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v104
-; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v81
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v107
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v66
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v88
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v79, 8, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v118, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v53
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v51
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v73
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v90
-; ALIGNED-NEXT: v_lshl_or_b32 v102, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v74, 8, v76
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v74
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v35
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v57
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v9
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v4, 8, v7
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v77
-; ALIGNED-NEXT: v_lshl_or_b32 v85, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v48
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v107, 16, v6
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v59
-; ALIGNED-NEXT: v_lshl_or_b32 v80, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v60
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v62
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v62
-; ALIGNED-NEXT: v_lshl_or_b32 v54, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v43, 8, v44
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v72
-; ALIGNED-NEXT: v_lshl_or_b32 v52, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v45, 8, v46
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v45, 8, v46
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v119
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v56
-; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v22
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v42
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v40
-; ALIGNED-NEXT: v_lshl_or_b32 v24, v5, 16, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v14
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v102, 8, v103
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v42, 8, v43
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v16, v6, 16, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v10
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v113
-; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v121, 16, v6
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v116, 8, v119
-; ALIGNED-NEXT: v_lshl_or_b32 v108, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v103, 8, v114
-; ALIGNED-NEXT: v_lshl_or_b32 v92, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v104, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v114
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v101, 8, v112
+; ALIGNED-NEXT: v_lshl_or_b32 v90, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v98, 8, v99
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v87, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v89, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v121, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v0, 8, v1
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v5, 8, v124
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v4, 8, v125
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v107, 8, v1
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v108, v0, 8, v108
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 16, v107
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v120, 8, v110
+; ALIGNED-NEXT: v_lshl_or_b32 v108, v3, 8, v126
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 16, v107
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v4, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:10
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v1, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_mov_b32_e32 v4, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v3, 8, v1
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v108, v108, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 16, v107
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v108, v110, 8, v1
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v1, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:18
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v0, 8, v3
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_mov_b32_e32 v3, v110
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 16, v107
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:17
; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v110
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v126, v107, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v126
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704
; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:708
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v124
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v1, 8, v125
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_add_co_u32 v121, vcc_lo, v5, s4
+; ALIGNED-NEXT: v_add_co_u32 v126, vcc_lo, v5, s4
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v122, null, s5, v6, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v127, null, s5, v6, vcc_lo
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
-; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v121, 3
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v122, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v126, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v127, vcc_lo
; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:247
-; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248
+; ALIGNED-NEXT: flat_store_byte v[5:6], v4 offset:248
; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:246
-; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:252
+; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:252
; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:250
-; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:251
-; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:249
-; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:245
-; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:239
-; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:240
-; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:238
-; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:244
-; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:242
-; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:243
-; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:241
-; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:237
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:244
-; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:231
-; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:232
-; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:230
-; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:236
-; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:234
-; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:235
-; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:233
+; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:251
+; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:249
+; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:245
+; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:239
+; ALIGNED-NEXT: flat_store_byte v[5:6], v16 offset:240
+; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:238
+; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:244
+; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:242
+; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:243
+; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:241
+; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:237
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:231
+; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:232
+; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:230
+; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:236
+; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:234
+; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:235
+; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:233
; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:229
-; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:223
-; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:224
-; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:222
-; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:228
-; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:226
-; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:227
-; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:225
-; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:221
-; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:192
-; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:204
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200
-; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:210
+; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:223
+; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:224
+; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:222
+; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:228
+; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:226
+; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:227
+; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:225
+; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:221
+; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:210
; ALIGNED-NEXT: flat_store_byte v[5:6], v68 offset:212
-; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:206
-; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:208
-; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:207
-; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:211
-; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:209
+; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:206
+; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:208
+; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:207
+; ALIGNED-NEXT: flat_store_byte v[5:6], v80 offset:211
+; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:209
; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:215
; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:216
-; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:214
-; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:220
-; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:218
-; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:219
-; ALIGNED-NEXT: flat_store_byte v[5:6], v81 offset:217
-; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:213
-; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:205
-; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
-; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:208
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:199
-; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:200
-; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:198
-; ALIGNED-NEXT: flat_store_byte v[5:6], v98 offset:204
-; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:202
-; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:203
-; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:201
+; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:214
+; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:220
+; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:218
+; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:219
+; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:217
+; ALIGNED-NEXT: flat_store_byte v[5:6], v81 offset:213
+; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:205
+; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:199
+; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:200
+; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:198
+; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:204
+; ALIGNED-NEXT: flat_store_byte v[5:6], v98 offset:202
+; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:203
+; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:201
; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:197
-; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:191
-; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:192
-; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:190
-; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:196
-; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:194
-; ALIGNED-NEXT: flat_store_byte v[5:6], v114 offset:195
-; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:193
-; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:189
+; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:191
+; ALIGNED-NEXT: flat_store_byte v[5:6], v102 offset:192
+; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:190
+; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:196
+; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:194
+; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:195
+; ALIGNED-NEXT: flat_store_byte v[5:6], v114 offset:193
+; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:189
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:183
-; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:184
-; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:182
-; ALIGNED-NEXT: flat_store_byte v[5:6], v45 offset:188
-; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:186
-; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:187
-; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:185
-; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:181
-; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:175
-; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:176
-; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:174
-; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:180
-; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:178
-; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:179
-; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:177
-; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:173
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:183
+; ALIGNED-NEXT: flat_store_byte v[5:6], v118 offset:184
+; ALIGNED-NEXT: flat_store_byte v[5:6], v41 offset:182
+; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:188
+; ALIGNED-NEXT: flat_store_byte v[5:6], v45 offset:186
+; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:187
+; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:185
+; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:181
+; ALIGNED-NEXT: flat_store_byte v[5:6], v57 offset:175
+; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:176
+; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:174
+; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:180
+; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:178
+; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:179
+; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:177
+; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:173
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:167
-; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:168
-; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:166
-; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:172
-; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:170
-; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:171
-; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:169
-; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:165
-; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:159
-; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:160
-; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:158
-; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:164
-; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:162
-; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:163
-; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:161
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:167
+; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:168
+; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:166
+; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:172
+; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:170
+; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:171
+; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:169
+; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:165
+; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:159
+; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:160
+; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:158
+; ALIGNED-NEXT: flat_store_byte v[5:6], v91 offset:164
+; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:162
+; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:163
+; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:161
; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151
-; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:152
-; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:150
-; ALIGNED-NEXT: flat_store_byte v[5:6], v126 offset:156
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v121 offset:151
+; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:152
+; ALIGNED-NEXT: flat_store_byte v[5:6], v122 offset:150
+; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:156
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:155
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153
-; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:149
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload
@@ -13826,13 +13833,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:132
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload
@@ -13840,7 +13847,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:128
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:128
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360
@@ -13904,10 +13911,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
@@ -13919,16 +13926,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
@@ -14080,7 +14087,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:64
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:64
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424
@@ -14188,7 +14195,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:32
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:32
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36
@@ -14210,7 +14217,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23
@@ -14235,11 +14244,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21
-; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16
-; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:14
+; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:15
+; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:16
+; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20
@@ -14252,63 +14259,63 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17
-; ALIGNED-NEXT: flat_store_byte v[121:122], v125 offset:16
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v120 offset:16
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v3 offset:7
-; ALIGNED-NEXT: flat_store_byte v[5:6], v4 offset:8
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:7
+; ALIGNED-NEXT: flat_store_byte v[5:6], v3 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:10
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:6
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:8
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:8
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:2
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:1
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:1
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:4
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0
; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1
; ALIGNED-NEXT: .LBB9_2: ; %Flow10
; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6
@@ -14326,30 +14333,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:23
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v127, v6, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:29
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:30
-; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:31
; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:33
-; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:36
-; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:38
-; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:39
; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:43
-; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:47
; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:48
; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:49
; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:50
@@ -14360,45 +14366,43 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:55
; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v50, v6, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v48, v6, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:63
-; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:64
-; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:65
-; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:66
; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:67
-; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:69
; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:70
; ALIGNED-NEXT: buffer_load_ubyte v68, v6, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:76
-; ALIGNED-NEXT: buffer_load_ubyte v81, v6, s[0:3], 0 offen offset:75
; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT: buffer_load_ubyte v81, v6, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:73
; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v123, v6, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v122, v6, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v111, v6, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: buffer_load_ubyte v108, v6, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v105, v6, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: s_clause 0x34
-; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v106, v6, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v6, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v104, v6, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v91, v6, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v78, v6, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v77, v6, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v74, v6, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v63, v6, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v62, v6, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v125, v6, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v123, v6, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v111, v6, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v124, v6, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: s_clause 0x30
+; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v108, v6, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v90, v6, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v79, v6, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v77, v6, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v78, v6, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v72, v6, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v63, v6, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v62, v6, s[0:3], 0 offen offset:171
; ALIGNED-NEXT: buffer_load_ubyte v61, v6, s[0:3], 0 offen offset:176
; ALIGNED-NEXT: buffer_load_ubyte v59, v6, s[0:3], 0 offen offset:177
; ALIGNED-NEXT: buffer_load_ubyte v47, v6, s[0:3], 0 offen offset:178
@@ -14411,63 +14415,62 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v43, v6, s[0:3], 0 offen offset:189
; ALIGNED-NEXT: buffer_load_ubyte v42, v6, s[0:3], 0 offen offset:190
; ALIGNED-NEXT: buffer_load_ubyte v41, v6, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v6, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v119, v6, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v118, v6, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v116, v6, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v40, v6, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v119, v6, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v118, v6, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v116, v6, s[0:3], 0 offen offset:187
; ALIGNED-NEXT: buffer_load_ubyte v115, v6, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v112, v6, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v113, v6, s[0:3], 0 offen offset:193
; ALIGNED-NEXT: buffer_load_ubyte v101, v6, s[0:3], 0 offen offset:194
; ALIGNED-NEXT: buffer_load_ubyte v100, v6, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v113, v6, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v112, v6, s[0:3], 0 offen offset:196
; ALIGNED-NEXT: buffer_load_ubyte v103, v6, s[0:3], 0 offen offset:197
; ALIGNED-NEXT: buffer_load_ubyte v102, v6, s[0:3], 0 offen offset:198
; ALIGNED-NEXT: buffer_load_ubyte v99, v6, s[0:3], 0 offen offset:199
; ALIGNED-NEXT: buffer_load_ubyte v97, v6, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v87, v6, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v6, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v86, v6, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v6, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v84, v6, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v83, v6, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v124, v6, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_load_ubyte v96, v6, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v87, v6, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v85, v6, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v105, v6, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v107, v6, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_load_ubyte v106, v6, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:7
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v10, 8, v9
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v5
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19
-; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v21, 8, v19
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v17, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v27, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v29, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v26, 8, v25
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v4
; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7
@@ -14489,72 +14492,81 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v5, v51, 8, v38
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v52, 8, v16
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v55
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:84
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:74
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(60)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:212
; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:213
; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v82, 8, v81
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v86, v6, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v84, v6, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v83, v6, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v68, v6, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:223
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v50, v6, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v48, v6, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:211
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
@@ -14572,31 +14584,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:229
; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:230
; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:239
; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:235
; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0xc
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0xb
; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:240
; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:241
; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:242
@@ -14606,59 +14618,53 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:246
; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:247
; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v120
-; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v6, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(59)
+; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v75, 8, v93
+; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(56)
-; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(55)
-; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(54)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(52)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(50)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(48)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v2
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:80
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:94
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:98
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
@@ -14666,33 +14672,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:102
; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:103
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:89
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
@@ -14700,7 +14708,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14708,23 +14716,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:99
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:100
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:97
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:96
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:110
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:114
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
@@ -14732,33 +14741,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:118
; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:119
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:117
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:105
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
@@ -14766,7 +14777,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14774,23 +14785,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:115
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:116
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:113
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:112
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:126
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:130
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
@@ -14798,33 +14810,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:134
; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:135
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:133
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:121
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
@@ -14832,7 +14846,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14840,23 +14854,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:132
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:129
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:128
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:142
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:146
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
@@ -14864,33 +14879,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:150
; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:151
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:149
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:137
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
@@ -14898,7 +14915,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14906,174 +14923,180 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:148
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:145
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:144
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:158
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v12
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v9, 8, v11
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:159
; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v18
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:156
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v100, 8, v101
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v127, v6, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v82, 8, v83
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
+; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v127
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v84, 8, v86
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v94
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v111
+; ALIGNED-NEXT: v_lshl_or_b32 v74, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v80
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125
+; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v69, 8, v71
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v104
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v108, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v55
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v106
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v124
+; ALIGNED-NEXT: v_lshl_or_b32 v117, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v66, 8, v68
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v64
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v79
+; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v39, 8, v50
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v88, 8, v90
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v48
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v62, 8, v63
+; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v78
+; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
+; ALIGNED-NEXT: v_lshl_or_b32 v53, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v28, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v27
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
+; ALIGNED-NEXT: v_lshl_or_b32 v51, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v24, 8, v26
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v23
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
+; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v19, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v15
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v20
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v116, 8, v118
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v40
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v12, 8, v13
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v100, 8, v101
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115
-; ALIGNED-NEXT: v_lshl_or_b32 v107, v2, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v99, 8, v102
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v2, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v115
+; ALIGNED-NEXT: v_lshl_or_b32 v122, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v112
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v99, 8, v102
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v96, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v85, 8, v87
+; ALIGNED-NEXT: v_lshl_or_b32 v91, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:251
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen
-; ALIGNED-NEXT: v_lshl_or_b32 v79, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82
-; ALIGNED-NEXT: v_lshl_or_b32 v72, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v80
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v70
-; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v66, 8, v71
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53
-; ALIGNED-NEXT: v_lshl_or_b32 v117, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
-; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v49
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v39
-; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31
-; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34
-; ALIGNED-NEXT: v_lshl_or_b32 v68, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28
-; ALIGNED-NEXT: v_lshl_or_b32 v50, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v23, 8, v24
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v19, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v15
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v20
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v11, 8, v13
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 8, v8
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 8, v10
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v9, 8, v1
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v88, 16, v5
-; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v1, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v73, 16, v5
+; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v88, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v109, 8, v93
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v121, 8, v110
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:10
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v125, v6, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v126, 8, v89
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v125, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v110, 8, v121
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v120, 8, v109
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 16, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v94, 8, v92
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v104, v6, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v89, 8, v76
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 16, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v105, 8, v110
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v106, 8, v107
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:18
-; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:16
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:17
-; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0xffffff00, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 16, v73
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v94, 8, v104
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v92, 8, v93
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 16, v73
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0xffffff00, v6
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484
@@ -15082,24 +15105,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704
; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v88
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v75
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v89, 8, v93
+; ALIGNED-NEXT: v_lshl_or_b32 v126, v73, 8, v89
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v3, vcc_lo
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126
; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, 3
; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v3, vcc_lo
-; ALIGNED-NEXT: flat_store_byte v[4:5], v1 offset:247
-; ALIGNED-NEXT: flat_store_byte v[4:5], v9 offset:248
-; ALIGNED-NEXT: flat_store_byte v[4:5], v7 offset:246
-; ALIGNED-NEXT: flat_store_byte v[4:5], v10 offset:252
-; ALIGNED-NEXT: flat_store_byte v[4:5], v11 offset:250
-; ALIGNED-NEXT: flat_store_byte v[4:5], v12 offset:251
+; ALIGNED-NEXT: flat_store_byte v[4:5], v7 offset:247
+; ALIGNED-NEXT: flat_store_byte v[4:5], v1 offset:248
+; ALIGNED-NEXT: flat_store_byte v[4:5], v8 offset:246
+; ALIGNED-NEXT: flat_store_byte v[4:5], v9 offset:252
+; ALIGNED-NEXT: flat_store_byte v[4:5], v12 offset:250
+; ALIGNED-NEXT: flat_store_byte v[4:5], v11 offset:251
; ALIGNED-NEXT: flat_store_byte v[4:5], v13 offset:249
-; ALIGNED-NEXT: flat_store_byte v[4:5], v8 offset:245
+; ALIGNED-NEXT: flat_store_byte v[4:5], v10 offset:245
; ALIGNED-NEXT: flat_store_byte v[4:5], v15 offset:239
; ALIGNED-NEXT: flat_store_byte v[4:5], v16 offset:240
; ALIGNED-NEXT: flat_store_byte v[4:5], v19 offset:238
@@ -15108,18 +15131,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[4:5], v18 offset:243
; ALIGNED-NEXT: flat_store_byte v[4:5], v20 offset:241
; ALIGNED-NEXT: flat_store_byte v[4:5], v21 offset:237
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:508
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500
; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496
-; ALIGNED-NEXT: flat_store_byte v[4:5], v22 offset:231
-; ALIGNED-NEXT: flat_store_byte v[4:5], v25 offset:232
-; ALIGNED-NEXT: flat_store_byte v[4:5], v23 offset:230
-; ALIGNED-NEXT: flat_store_byte v[4:5], v26 offset:236
-; ALIGNED-NEXT: flat_store_byte v[4:5], v27 offset:234
-; ALIGNED-NEXT: flat_store_byte v[4:5], v28 offset:235
+; ALIGNED-NEXT: flat_store_byte v[4:5], v23 offset:231
+; ALIGNED-NEXT: flat_store_byte v[4:5], v22 offset:232
+; ALIGNED-NEXT: flat_store_byte v[4:5], v24 offset:230
+; ALIGNED-NEXT: flat_store_byte v[4:5], v25 offset:236
+; ALIGNED-NEXT: flat_store_byte v[4:5], v28 offset:234
+; ALIGNED-NEXT: flat_store_byte v[4:5], v27 offset:235
; ALIGNED-NEXT: flat_store_byte v[4:5], v29 offset:233
-; ALIGNED-NEXT: flat_store_byte v[4:5], v24 offset:229
+; ALIGNED-NEXT: flat_store_byte v[4:5], v26 offset:229
; ALIGNED-NEXT: flat_store_byte v[4:5], v31 offset:223
; ALIGNED-NEXT: flat_store_byte v[4:5], v32 offset:224
; ALIGNED-NEXT: flat_store_byte v[4:5], v35 offset:222
@@ -15133,41 +15156,41 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:456
; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452
; ALIGNED-NEXT: flat_store_byte v[4:5], v67 offset:210
-; ALIGNED-NEXT: flat_store_byte v[4:5], v64 offset:212
-; ALIGNED-NEXT: flat_store_byte v[4:5], v38 offset:206
-; ALIGNED-NEXT: flat_store_byte v[4:5], v65 offset:208
-; ALIGNED-NEXT: flat_store_byte v[4:5], v39 offset:207
+; ALIGNED-NEXT: flat_store_byte v[4:5], v65 offset:212
+; ALIGNED-NEXT: flat_store_byte v[4:5], v39 offset:206
+; ALIGNED-NEXT: flat_store_byte v[4:5], v38 offset:208
+; ALIGNED-NEXT: flat_store_byte v[4:5], v48 offset:207
; ALIGNED-NEXT: flat_store_byte v[4:5], v70 offset:211
; ALIGNED-NEXT: flat_store_byte v[4:5], v80 offset:209
-; ALIGNED-NEXT: flat_store_byte v[4:5], v53 offset:215
-; ALIGNED-NEXT: flat_store_byte v[4:5], v52 offset:216
-; ALIGNED-NEXT: flat_store_byte v[4:5], v66 offset:214
-; ALIGNED-NEXT: flat_store_byte v[4:5], v51 offset:220
-; ALIGNED-NEXT: flat_store_byte v[4:5], v55 offset:218
-; ALIGNED-NEXT: flat_store_byte v[4:5], v54 offset:219
-; ALIGNED-NEXT: flat_store_byte v[4:5], v69 offset:217
+; ALIGNED-NEXT: flat_store_byte v[4:5], v55 offset:215
+; ALIGNED-NEXT: flat_store_byte v[4:5], v54 offset:216
+; ALIGNED-NEXT: flat_store_byte v[4:5], v69 offset:214
+; ALIGNED-NEXT: flat_store_byte v[4:5], v52 offset:220
+; ALIGNED-NEXT: flat_store_byte v[4:5], v66 offset:218
+; ALIGNED-NEXT: flat_store_byte v[4:5], v64 offset:219
+; ALIGNED-NEXT: flat_store_byte v[4:5], v68 offset:217
; ALIGNED-NEXT: flat_store_byte v[4:5], v71 offset:213
-; ALIGNED-NEXT: flat_store_byte v[4:5], v49 offset:205
-; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:472
-; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:476
-; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468
-; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: flat_store_byte v[4:5], v50 offset:205
+; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:464
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v82 offset:199
-; ALIGNED-NEXT: flat_store_byte v[4:5], v85 offset:200
-; ALIGNED-NEXT: flat_store_byte v[4:5], v83 offset:198
-; ALIGNED-NEXT: flat_store_byte v[4:5], v86 offset:204
-; ALIGNED-NEXT: flat_store_byte v[4:5], v87 offset:202
-; ALIGNED-NEXT: flat_store_byte v[4:5], v96 offset:203
+; ALIGNED-NEXT: flat_store_byte v[4:5], v83 offset:199
+; ALIGNED-NEXT: flat_store_byte v[4:5], v82 offset:200
+; ALIGNED-NEXT: flat_store_byte v[4:5], v84 offset:198
+; ALIGNED-NEXT: flat_store_byte v[4:5], v85 offset:204
+; ALIGNED-NEXT: flat_store_byte v[4:5], v96 offset:202
+; ALIGNED-NEXT: flat_store_byte v[4:5], v87 offset:203
; ALIGNED-NEXT: flat_store_byte v[4:5], v97 offset:201
-; ALIGNED-NEXT: flat_store_byte v[4:5], v84 offset:197
+; ALIGNED-NEXT: flat_store_byte v[4:5], v86 offset:197
; ALIGNED-NEXT: flat_store_byte v[4:5], v101 offset:191
; ALIGNED-NEXT: flat_store_byte v[4:5], v100 offset:192
-; ALIGNED-NEXT: flat_store_byte v[4:5], v112 offset:190
+; ALIGNED-NEXT: flat_store_byte v[4:5], v113 offset:190
; ALIGNED-NEXT: flat_store_byte v[4:5], v99 offset:196
; ALIGNED-NEXT: flat_store_byte v[4:5], v103 offset:194
; ALIGNED-NEXT: flat_store_byte v[4:5], v102 offset:195
-; ALIGNED-NEXT: flat_store_byte v[4:5], v113 offset:193
+; ALIGNED-NEXT: flat_store_byte v[4:5], v112 offset:193
; ALIGNED-NEXT: flat_store_byte v[4:5], v115 offset:189
; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
@@ -15184,14 +15207,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v116 offset:183
-; ALIGNED-NEXT: flat_store_byte v[4:5], v40 offset:184
-; ALIGNED-NEXT: flat_store_byte v[4:5], v118 offset:182
+; ALIGNED-NEXT: flat_store_byte v[4:5], v118 offset:183
+; ALIGNED-NEXT: flat_store_byte v[4:5], v116 offset:184
+; ALIGNED-NEXT: flat_store_byte v[4:5], v119 offset:182
; ALIGNED-NEXT: flat_store_byte v[4:5], v41 offset:188
; ALIGNED-NEXT: flat_store_byte v[4:5], v43 offset:186
; ALIGNED-NEXT: flat_store_byte v[4:5], v42 offset:187
; ALIGNED-NEXT: flat_store_byte v[4:5], v44 offset:185
-; ALIGNED-NEXT: flat_store_byte v[4:5], v119 offset:181
+; ALIGNED-NEXT: flat_store_byte v[4:5], v40 offset:181
; ALIGNED-NEXT: flat_store_byte v[4:5], v47 offset:175
; ALIGNED-NEXT: flat_store_byte v[4:5], v56 offset:176
; ALIGNED-NEXT: flat_store_byte v[4:5], v59 offset:174
@@ -15211,69 +15234,75 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v62 offset:167
-; ALIGNED-NEXT: flat_store_byte v[4:5], v74 offset:168
-; ALIGNED-NEXT: flat_store_byte v[4:5], v63 offset:166
-; ALIGNED-NEXT: flat_store_byte v[4:5], v75 offset:172
-; ALIGNED-NEXT: flat_store_byte v[4:5], v77 offset:170
-; ALIGNED-NEXT: flat_store_byte v[4:5], v76 offset:171
-; ALIGNED-NEXT: flat_store_byte v[4:5], v78 offset:169
-; ALIGNED-NEXT: flat_store_byte v[4:5], v73 offset:165
-; ALIGNED-NEXT: flat_store_byte v[4:5], v94 offset:159
-; ALIGNED-NEXT: flat_store_byte v[4:5], v92 offset:160
-; ALIGNED-NEXT: flat_store_byte v[4:5], v105 offset:158
-; ALIGNED-NEXT: flat_store_byte v[4:5], v91 offset:164
-; ALIGNED-NEXT: flat_store_byte v[4:5], v95 offset:162
-; ALIGNED-NEXT: flat_store_byte v[4:5], v104 offset:163
-; ALIGNED-NEXT: flat_store_byte v[4:5], v106 offset:161
-; ALIGNED-NEXT: flat_store_byte v[4:5], v108 offset:157
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[4:5], v63 offset:167
+; ALIGNED-NEXT: flat_store_byte v[4:5], v62 offset:168
+; ALIGNED-NEXT: flat_store_byte v[4:5], v72 offset:166
+; ALIGNED-NEXT: flat_store_byte v[4:5], v77 offset:172
+; ALIGNED-NEXT: flat_store_byte v[4:5], v88 offset:170
+; ALIGNED-NEXT: flat_store_byte v[4:5], v79 offset:171
+; ALIGNED-NEXT: flat_store_byte v[4:5], v90 offset:169
+; ALIGNED-NEXT: flat_store_byte v[4:5], v78 offset:165
+; ALIGNED-NEXT: flat_store_byte v[4:5], v111 offset:159
+; ALIGNED-NEXT: flat_store_byte v[4:5], v109 offset:160
+; ALIGNED-NEXT: flat_store_byte v[4:5], v123 offset:158
+; ALIGNED-NEXT: flat_store_byte v[4:5], v108 offset:164
+; ALIGNED-NEXT: flat_store_byte v[4:5], v121 offset:162
+; ALIGNED-NEXT: flat_store_byte v[4:5], v120 offset:163
+; ALIGNED-NEXT: flat_store_byte v[4:5], v124 offset:161
+; ALIGNED-NEXT: flat_store_byte v[4:5], v125 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v111 offset:151
-; ALIGNED-NEXT: flat_store_byte v[4:5], v124 offset:152
-; ALIGNED-NEXT: flat_store_byte v[4:5], v122 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[4:5], v127 offset:151
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:152
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:156
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:154
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:155
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:153
-; ALIGNED-NEXT: flat_store_byte v[4:5], v123 offset:149
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:149
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:143
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:144
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:142
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:148
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:146
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:147
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:145
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
@@ -15522,13 +15551,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
@@ -15537,10 +15566,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:72
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:70
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:76
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
@@ -15708,7 +15737,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:640
+; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:640
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:23
@@ -15733,11 +15762,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:21
-; ALIGNED-NEXT: flat_store_byte v[4:5], v88 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:16
-; ALIGNED-NEXT: flat_store_byte v[4:5], v89 offset:14
+; ALIGNED-NEXT: flat_store_byte v[4:5], v75 offset:15
+; ALIGNED-NEXT: flat_store_byte v[4:5], v76 offset:16
+; ALIGNED-NEXT: flat_store_byte v[4:5], v73 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:20
@@ -15750,53 +15777,49 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:17
-; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:16
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:16
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v109 offset:7
-; ALIGNED-NEXT: flat_store_byte v[4:5], v120 offset:8
-; ALIGNED-NEXT: flat_store_byte v[4:5], v125 offset:10
-; ALIGNED-NEXT: flat_store_byte v[4:5], v110 offset:6
-; ALIGNED-NEXT: flat_store_byte v[4:5], v126 offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:9
-; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:8
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[4:5], v93 offset:7
+; ALIGNED-NEXT: flat_store_byte v[4:5], v92 offset:8
+; ALIGNED-NEXT: flat_store_byte v[4:5], v105 offset:10
+; ALIGNED-NEXT: flat_store_byte v[4:5], v94 offset:6
+; ALIGNED-NEXT: flat_store_byte v[4:5], v106 offset:12
+; ALIGNED-NEXT: flat_store_byte v[4:5], v107 offset:11
+; ALIGNED-NEXT: flat_store_byte v[4:5], v110 offset:9
+; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:3
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0
; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index 01b7f40f6256f..a8b3ffbb21ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -142,8 +142,8 @@ define void @memmove_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
@@ -198,18 +198,18 @@ define void @memmove_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:20
; CHECK-NEXT: flat_load_dword v2, v[2:3] offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
@@ -371,8 +371,8 @@ define void @memmove_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
@@ -427,18 +427,18 @@ define void @memmove_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:20
; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
@@ -820,8 +820,8 @@ define void @memmove_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
@@ -876,18 +876,18 @@ define void @memmove_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:20
; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
@@ -1320,8 +1320,8 @@ define void @memmove_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
@@ -1373,18 +1373,18 @@ define void @memmove_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:20
; CHECK-NEXT: flat_load_dword v2, v[2:3] offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v9, off offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
@@ -1537,8 +1537,8 @@ define void @memmove_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
@@ -1590,18 +1590,18 @@ define void @memmove_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:20
; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v9, off offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
@@ -1755,16 +1755,17 @@ define void @memmove_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1
; CHECK-NEXT: ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:30
; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
-; CHECK-NEXT: ds_read_u16 v2, v2 offset:28
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_byte v[0:1], v2, off offset:30
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -1806,16 +1807,17 @@ define void @memmove_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b128 v[3:6], v2
; CHECK-NEXT: ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:30
; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
-; CHECK-NEXT: ds_read_u16 v2, v2 offset:28
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_byte v[0:1], v2, off offset:30
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -1964,8 +1966,8 @@ define void @memmove_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
@@ -2017,18 +2019,18 @@ define void @memmove_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:20
; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v9, off offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
@@ -2513,18 +2515,18 @@ define void @memmove_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: ds_write_b32 v0, v8 offset:16
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b32 v0, v8 offset:20
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:20
; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:24
-; CHECK-NEXT: ds_write_b8 v0, v7 offset:30
-; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT: ds_write_b8 v0, v9 offset:30
+; CHECK-NEXT: ds_write_b16 v0, v8 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; CHECK-NEXT: ds_write_b32 v0, v1 offset:24
; CHECK-NEXT: ds_write_b128 v0, v[3:6]
@@ -3751,18 +3753,18 @@ define void @memmove_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:24
-; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index 9585c486aeb9e..fd86113a3538d 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -300,14 +300,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
; GFX908-NEXT: ; def a0
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
@@ -406,14 +406,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
; GFX908-NEXT: ; use a[100:131]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
@@ -512,14 +512,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
; GFX908-NEXT: ; def v0
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
@@ -640,14 +640,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mov_b32_e32 v40, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v40, s[34:35] offset:96
-; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112
; GFX908-NEXT: global_load_dwordx4 v[16:19], v40, s[34:35] offset:64
-; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80
; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32
-; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48
; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35]
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
@@ -925,14 +925,14 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
-; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
-; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
@@ -1026,14 +1026,14 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
-; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
-; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index 30ad3be46053c..829895d6784a4 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -12,28 +12,29 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
; GFX9-NEXT: v_ffbh_u32_e32 v2, v2
@@ -49,33 +50,36 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
-; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:5
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:7
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3]
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:2
+; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_ffbh_u32_e32 v2, v3
+; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
-; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
+; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
+; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
@@ -91,28 +95,29 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
; GFX9-NEXT: v_ffbh_u32_e32 v2, v2
@@ -129,33 +134,36 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
-; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:5
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:7
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3]
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:2
+; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_ffbh_u32_e32 v2, v3
+; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
-; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
+; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
+; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
@@ -172,28 +180,29 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
; GFX9-NEXT: v_ffbl_b32_e32 v0, v0
@@ -211,27 +220,28 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v5
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
@@ -253,28 +263,29 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
; GFX9-NEXT: v_ffbl_b32_e32 v0, v0
@@ -293,27 +304,28 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v5
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll
index db82530f66aa4..abe729d41f86c 100644
--- a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll
@@ -48,19 +48,23 @@ define <3 x i64> @v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
; CHECK-LABEL: v3_ashr_metadata:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: v_add_co_u32_e32 v11, vcc, 20, v0
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v1, vcc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_load_dword v4, v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
-; CHECK-NEXT: v_mov_b32_e32 v3, -1
-; CHECK-NEXT: flat_load_dword v1, v[0:1]
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; CHECK-NEXT: flat_load_dword v5, v[11:12]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5
-; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7
-; CHECK-NEXT: v_ashrrev_i32_e32 v4, v4, v1
+; CHECK-NEXT: flat_load_dword v7, v[2:3] offset:16
+; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3
+; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; CHECK-NEXT: v_mov_b32_e32 v1, -1
+; CHECK-NEXT: v_mov_b32_e32 v3, -1
+; CHECK-NEXT: ; kill: killed $vgpr11 killed $vgpr12
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v8
+; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v10
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_ashrrev_i32_e32 v4, v7, v5
; CHECK-NEXT: v_mov_b32_e32 v5, -1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = load <3 x i64>, ptr %arg0.ptr, !range !4, !noundef !{}
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index e92bc1f997c58..5b5ecabe1dc58 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -188,64 +188,61 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900-LABEL: fadd_v32_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
-; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
-; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
-; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
-; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
-; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
-; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
-; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
-; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
-; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
-; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
-; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
-; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
-; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
-; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
-; GFX900-NEXT: s_waitcnt vmcnt(3)
-; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
-; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
-; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
-; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
-; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
-; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
-; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
-; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
-; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
-; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
-; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_add_f32_e32 v3, s43, v3
+; GFX900-NEXT: v_add_f32_e32 v2, s42, v2
+; GFX900-NEXT: v_add_f32_e32 v1, s41, v1
+; GFX900-NEXT: v_add_f32_e32 v0, s40, v0
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_add_f32_e32 v7, s39, v7
+; GFX900-NEXT: v_add_f32_e32 v6, s38, v6
+; GFX900-NEXT: v_add_f32_e32 v5, s37, v5
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
-; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
-; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
-; GFX900-NEXT: v_add_f32_e32 v21, s8, v21
-; GFX900-NEXT: v_add_f32_e32 v28, s23, v28
-; GFX900-NEXT: v_add_f32_e32 v27, s22, v27
-; GFX900-NEXT: v_add_f32_e32 v26, s21, v26
-; GFX900-NEXT: v_add_f32_e32 v25, s20, v25
-; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
-; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
-; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
-; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
-; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
-; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
-; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
-; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: v_add_f32_e32 v31, s19, v31
+; GFX900-NEXT: v_add_f32_e32 v30, s18, v30
+; GFX900-NEXT: v_add_f32_e32 v29, s17, v29
+; GFX900-NEXT: v_add_f32_e32 v28, s16, v28
+; GFX900-NEXT: v_add_f32_e32 v4, s36, v4
+; GFX900-NEXT: v_add_f32_e32 v11, s51, v11
+; GFX900-NEXT: v_add_f32_e32 v10, s50, v10
+; GFX900-NEXT: v_add_f32_e32 v9, s49, v9
+; GFX900-NEXT: v_add_f32_e32 v8, s48, v8
+; GFX900-NEXT: v_add_f32_e32 v15, s47, v15
+; GFX900-NEXT: v_add_f32_e32 v14, s46, v14
+; GFX900-NEXT: v_add_f32_e32 v13, s45, v13
+; GFX900-NEXT: v_add_f32_e32 v12, s44, v12
+; GFX900-NEXT: v_add_f32_e32 v19, s15, v19
+; GFX900-NEXT: v_add_f32_e32 v18, s14, v18
+; GFX900-NEXT: v_add_f32_e32 v17, s13, v17
+; GFX900-NEXT: v_add_f32_e32 v16, s12, v16
+; GFX900-NEXT: v_add_f32_e32 v23, s11, v23
+; GFX900-NEXT: v_add_f32_e32 v22, s10, v22
+; GFX900-NEXT: v_add_f32_e32 v21, s9, v21
+; GFX900-NEXT: v_add_f32_e32 v20, s8, v20
+; GFX900-NEXT: v_add_f32_e32 v27, s23, v27
+; GFX900-NEXT: v_add_f32_e32 v26, s22, v26
+; GFX900-NEXT: v_add_f32_e32 v25, s21, v25
+; GFX900-NEXT: v_add_f32_e32 v24, s20, v24
+; GFX900-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v32_vs:
@@ -1475,64 +1472,61 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900-LABEL: fmul_v32_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
-; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
-; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
-; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
-; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
-; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
-; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
-; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
-; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
-; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
-; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
-; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
-; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
-; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
-; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
-; GFX900-NEXT: s_waitcnt vmcnt(3)
-; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
-; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
-; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
-; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
-; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
-; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
-; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
-; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
-; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
-; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
-; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_mul_f32_e32 v3, s43, v3
+; GFX900-NEXT: v_mul_f32_e32 v2, s42, v2
+; GFX900-NEXT: v_mul_f32_e32 v1, s41, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, s40, v0
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_mul_f32_e32 v7, s39, v7
+; GFX900-NEXT: v_mul_f32_e32 v6, s38, v6
+; GFX900-NEXT: v_mul_f32_e32 v5, s37, v5
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
-; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
-; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
-; GFX900-NEXT: v_mul_f32_e32 v21, s8, v21
-; GFX900-NEXT: v_mul_f32_e32 v28, s23, v28
-; GFX900-NEXT: v_mul_f32_e32 v27, s22, v27
-; GFX900-NEXT: v_mul_f32_e32 v26, s21, v26
-; GFX900-NEXT: v_mul_f32_e32 v25, s20, v25
-; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
-; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
-; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
-; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
-; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
-; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
-; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
-; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: v_mul_f32_e32 v31, s19, v31
+; GFX900-NEXT: v_mul_f32_e32 v30, s18, v30
+; GFX900-NEXT: v_mul_f32_e32 v29, s17, v29
+; GFX900-NEXT: v_mul_f32_e32 v28, s16, v28
+; GFX900-NEXT: v_mul_f32_e32 v4, s36, v4
+; GFX900-NEXT: v_mul_f32_e32 v11, s51, v11
+; GFX900-NEXT: v_mul_f32_e32 v10, s50, v10
+; GFX900-NEXT: v_mul_f32_e32 v9, s49, v9
+; GFX900-NEXT: v_mul_f32_e32 v8, s48, v8
+; GFX900-NEXT: v_mul_f32_e32 v15, s47, v15
+; GFX900-NEXT: v_mul_f32_e32 v14, s46, v14
+; GFX900-NEXT: v_mul_f32_e32 v13, s45, v13
+; GFX900-NEXT: v_mul_f32_e32 v12, s44, v12
+; GFX900-NEXT: v_mul_f32_e32 v19, s15, v19
+; GFX900-NEXT: v_mul_f32_e32 v18, s14, v18
+; GFX900-NEXT: v_mul_f32_e32 v17, s13, v17
+; GFX900-NEXT: v_mul_f32_e32 v16, s12, v16
+; GFX900-NEXT: v_mul_f32_e32 v23, s11, v23
+; GFX900-NEXT: v_mul_f32_e32 v22, s10, v22
+; GFX900-NEXT: v_mul_f32_e32 v21, s9, v21
+; GFX900-NEXT: v_mul_f32_e32 v20, s8, v20
+; GFX900-NEXT: v_mul_f32_e32 v27, s23, v27
+; GFX900-NEXT: v_mul_f32_e32 v26, s22, v26
+; GFX900-NEXT: v_mul_f32_e32 v25, s21, v25
+; GFX900-NEXT: v_mul_f32_e32 v24, s20, v24
+; GFX900-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fmul_v32_vs:
@@ -2323,64 +2317,61 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900-LABEL: fma_v32_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
-; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
-; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
-; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
-; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
-; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
-; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
-; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
-; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
-; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
-; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
-; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
-; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
-; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
-; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
-; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
-; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
-; GFX900-NEXT: s_waitcnt vmcnt(3)
-; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
-; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
-; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
-; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
-; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
-; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
-; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
-; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
-; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
-; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
-; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_fma_f32 v3, v3, s43, s43
+; GFX900-NEXT: v_fma_f32 v2, v2, s42, s42
+; GFX900-NEXT: v_fma_f32 v1, v1, s41, s41
+; GFX900-NEXT: v_fma_f32 v0, v0, s40, s40
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_fma_f32 v7, v7, s39, s39
+; GFX900-NEXT: v_fma_f32 v6, v6, s38, s38
+; GFX900-NEXT: v_fma_f32 v5, v5, s37, s37
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
-; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
-; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
-; GFX900-NEXT: v_fma_f32 v21, v21, s8, s8
-; GFX900-NEXT: v_fma_f32 v28, v28, s23, s23
-; GFX900-NEXT: v_fma_f32 v27, v27, s22, s22
-; GFX900-NEXT: v_fma_f32 v26, v26, s21, s21
-; GFX900-NEXT: v_fma_f32 v25, v25, s20, s20
-; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
-; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
-; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
-; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
-; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
-; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
-; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
-; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: v_fma_f32 v31, v31, s19, s19
+; GFX900-NEXT: v_fma_f32 v30, v30, s18, s18
+; GFX900-NEXT: v_fma_f32 v29, v29, s17, s17
+; GFX900-NEXT: v_fma_f32 v28, v28, s16, s16
+; GFX900-NEXT: v_fma_f32 v4, v4, s36, s36
+; GFX900-NEXT: v_fma_f32 v11, v11, s51, s51
+; GFX900-NEXT: v_fma_f32 v10, v10, s50, s50
+; GFX900-NEXT: v_fma_f32 v9, v9, s49, s49
+; GFX900-NEXT: v_fma_f32 v8, v8, s48, s48
+; GFX900-NEXT: v_fma_f32 v15, v15, s47, s47
+; GFX900-NEXT: v_fma_f32 v14, v14, s46, s46
+; GFX900-NEXT: v_fma_f32 v13, v13, s45, s45
+; GFX900-NEXT: v_fma_f32 v12, v12, s44, s44
+; GFX900-NEXT: v_fma_f32 v19, v19, s15, s15
+; GFX900-NEXT: v_fma_f32 v18, v18, s14, s14
+; GFX900-NEXT: v_fma_f32 v17, v17, s13, s13
+; GFX900-NEXT: v_fma_f32 v16, v16, s12, s12
+; GFX900-NEXT: v_fma_f32 v23, v23, s11, s11
+; GFX900-NEXT: v_fma_f32 v22, v22, s10, s10
+; GFX900-NEXT: v_fma_f32 v21, v21, s9, s9
+; GFX900-NEXT: v_fma_f32 v20, v20, s8, s8
+; GFX900-NEXT: v_fma_f32 v27, v27, s23, s23
+; GFX900-NEXT: v_fma_f32 v26, v26, s22, s22
+; GFX900-NEXT: v_fma_f32 v25, v25, s21, s21
+; GFX900-NEXT: v_fma_f32 v24, v24, s20, s20
+; GFX900-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fma_v32_vs:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index f8c8c022567af..66fc49b604848 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3630,14 +3630,14 @@ define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
-; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6
-; GFX10-NEXT: global_load_ushort v3, v[0:1], off
-; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:2
-; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshl_or_b32 v0, v8, 16, v3
+; GFX10-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-NEXT: global_load_ushort v3, v[0:1], off offset:2
+; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:4
+; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:6
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v9
+; GFX10-NEXT: v_lshl_or_b32 v1, v9, 16, v8
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3776,14 +3776,14 @@ define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
-; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:64
+; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_perm_b32 v0, v12, v13, 0x1000504
+; GFX10-NEXT: v_perm_b32 v0, v10, v11, 0x1000504
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v1, v10, v14, 0x1000504
+; GFX10-NEXT: v_perm_b32 v1, v14, v8, 0x1000504
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3791,15 +3791,15 @@ define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-LABEL: extract_v13i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:64
+; GFX9-NEXT: global_load_dwordx4 v[9:12], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
; GFX9-NEXT: s_mov_b32 s4, 0x1000504
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_perm_b32 v0, v12, v13, s4
+; GFX9-NEXT: v_perm_b32 v0, v10, v11, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v1, v10, v14, s4
+; GFX9-NEXT: v_perm_b32 v1, v14, v8, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: global_store_dword v[6:7], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 69983faf2b154..8894e21dde7de 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-reorder-while-clustering=0 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-reorder-while-clustering=0 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s
# Check that %3 was not rematerialized before the last store since its operand %1
# is killed by that store.
@@ -117,6 +117,3 @@ body: |
S_ENDPGM 0
...
-## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-# GCN: {{.*}}
-# GCN-GCNTRACKER: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a2b0f4d56ebea..c2dd96ab75221 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -2142,9 +2142,9 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff8000, v1
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v22
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v20
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -2160,13 +2160,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off
; GFX9-NEXT: s_movk_i32 s0, 0x1000
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dwordx2 v[14:15], v[2:3], off offset:2048
; GFX9-NEXT: global_load_dwordx2 v[16:17], v[2:3], off
-; GFX9-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048
-; GFX9-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[18:19], v[0:1], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
@@ -2176,18 +2176,19 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35]
+; GFX9-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: ReverseOrder:
@@ -2297,14 +2298,14 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:2048
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0x1000, v0
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0x1000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:2048
-; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off
+; GFX11-NEXT: global_load_b64 v[10:11], v[8:9], off offset:2048
; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off
-; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:2048
+; GFX11-NEXT: global_load_b64 v[14:15], v[12:13], off offset:2048
+; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4
@@ -2315,20 +2316,21 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
-; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
index fc154604b8700..475fcf0d81829 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
@@ -14,14 +14,14 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: s_bitcmp0_b32 s6, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %else
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[2:3] offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[2:3] offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[2:3] offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v32, s[2:3] offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[2:3] offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v32, s[2:3] offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[2:3] offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v32, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[2:3] offset:16
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: v_mov_b32_e32 v34, 4.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -34,14 +34,14 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; CHECK-NEXT: .LBB0_3: ; %if
; CHECK-NEXT: s_nop 15
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v32, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v32, s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v32, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[0:1] offset:16
; CHECK-NEXT: v_mov_b32_e32 v32, 2.0
; CHECK-NEXT: v_mov_b32_e32 v33, 4.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -84,14 +84,14 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace
; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
; CHECK-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
; CHECK-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
; CHECK-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
; CHECK-NEXT: v_mov_b32_e32 v64, 4.0
; CHECK-NEXT: v_mov_b32_e32 v65, 2.0
; CHECK-NEXT: .LBB1_1: ; %loop
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx90a.ll
index 7d00b12e7334a..9fc0e6abd6334 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx90a.ll
@@ -7,10 +7,10 @@ define void @test_rewrite_mfma_i32_32x32x8i8(i32 %arg0, i32 %arg1, ptr addrspace
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x8i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -44,14 +44,14 @@ define void @test_rewrite_mfma_f32_32x32x2bf16(<2 x i16> %arg0, <2 x i16> %arg1,
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x2bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -68,10 +68,10 @@ define void @test_rewrite_mfma_f32_16x16x2bf16(<2 x i16> %arg0, <2 x i16> %arg1,
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x2bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -105,10 +105,10 @@ define void @test_rewrite_mfma_f32_32x32x4bf16(<2 x i16> %arg0, <2 x i16> %arg1,
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx950.ll
index b2465b02f2eee..ece60b9b10d8a 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx950.ll
@@ -24,10 +24,10 @@ define void @test_rewrite_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %ar
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -61,10 +61,10 @@ define void @test_rewrite_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1,
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x32_i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -98,10 +98,10 @@ define void @test_rewrite_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -240,10 +240,10 @@ define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[18:19], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[18:19], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[18:19], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[18:19], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[18:19], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[18:19], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
; CHECK-NEXT: ;;#ASMSTART
@@ -263,10 +263,10 @@ define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[16:17], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[16:17], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[16:17], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[16:17], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[16:17], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[16:17], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -286,10 +286,10 @@ define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v12, v13 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; CHECK-NEXT: ;;#ASMSTART
@@ -309,10 +309,10 @@ define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[12:13], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[12:13], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[12:13], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[12:13], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[12:13], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[12:13], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
; CHECK-NEXT: ;;#ASMSTART
@@ -332,10 +332,10 @@ define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[10:11], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[10:11], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[10:11], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[10:11], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[10:11], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v8, v9 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; CHECK-NEXT: ;;#ASMSTART
@@ -355,10 +355,10 @@ define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
; CHECK-NEXT: ;;#ASMSTART
@@ -399,10 +399,10 @@ define void @test_rewrite_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
@@ -440,10 +440,10 @@ define void @test_rewrite_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
@@ -481,10 +481,10 @@ define void @test_rewrite_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
@@ -579,10 +579,10 @@ define void @test_rewrite_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
@@ -601,10 +601,10 @@ define void @test_rewrite_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
@@ -623,10 +623,10 @@ define void @test_rewrite_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
@@ -645,10 +645,10 @@ define void @test_rewrite_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 47ebd072c4cc7..7c603a8a4b954 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -42,19 +42,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 7, v0
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v4, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v4, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v4, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v4, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v4, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v4, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v4, s[0:1] offset:16
; CHECK-NEXT: v_accvgpr_write_b32 a0, 1.0
; CHECK-NEXT: v_accvgpr_write_b32 a1, 2.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -123,19 +123,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 7, v0
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v4, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v4, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v4, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v4, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v4, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v4, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v4, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
@@ -237,18 +237,18 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_to_agpr_class(ptr addrs
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 7, v0
; CHECK-NEXT: v_mov_b32_e32 v32, 2.0
; CHECK-NEXT: v_mov_b32_e32 v33, 4.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 a[24:27], v4, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v4, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 a[16:19], v4, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v4, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 a[8:11], v4, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v4, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 a[0:3], v4, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v4, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -283,14 +283,14 @@ define void @test_rewrite_mfma_subreg_extract0(float %arg0, float %arg1, ptr add
; CHECK-LABEL: test_rewrite_mfma_subreg_extract0:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -309,14 +309,14 @@ define void @test_rewrite_mfma_subreg_extract1(float %arg0, float %arg1, ptr add
; CHECK-LABEL: test_rewrite_mfma_subreg_extract1:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -336,14 +336,14 @@ define void @test_rewrite_mfma_subreg_extract2(float %arg0, float %arg1, ptr add
; CHECK-LABEL: test_rewrite_mfma_subreg_extract2:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: s_nop 15
@@ -831,14 +831,14 @@ define void @test_rewrite_mfma_f32_32x32x1f32(float %arg0, float %arg1, ptr addr
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x1f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -855,10 +855,10 @@ define void @test_rewrite_mfma_f32_16x16x1f32(float %arg0, float %arg1, ptr addr
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x1f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -892,10 +892,10 @@ define void @test_rewrite_mfma_f32_32x32x2f32(float %arg0, float %arg1, ptr addr
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x2f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -929,14 +929,14 @@ define void @test_rewrite_mfma_f32_32x32x4f16(<4 x half> %arg0, <4 x half> %arg1
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[4:5], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[4:5], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[4:5], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[4:5], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[4:5], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[4:5], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -953,10 +953,10 @@ define void @test_rewrite_mfma_f32_16x16x4f16(<4 x half> %arg0, <4 x half> %arg1
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x4f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -990,10 +990,10 @@ define void @test_rewrite_mfma_f32_32x32x8f16(<4 x half> %arg0, <4 x half> %arg1
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x8f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1027,14 +1027,14 @@ define void @test_rewrite_mfma_i32_32x32x4i8(i32 %arg0, i32 %arg1, ptr addrspace
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x4i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -1051,10 +1051,10 @@ define void @test_rewrite_mfma_i32_16x16x4i8(i32 %arg0, i32 %arg1, ptr addrspace
; CHECK-LABEL: test_rewrite_mfma_i32_16x16x4i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1092,14 +1092,14 @@ define void @test_rewrite_mfma_f32_32x32x4bf16_1k(<4 x i16> %arg0, <4 x i16> %ar
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4bf16_1k:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[4:5], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[4:5], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[4:5], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[4:5], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[4:5], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[4:5], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[0:1], v[2:3], a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -1116,10 +1116,10 @@ define void @test_rewrite_mfma_f32_16x16x4bf16_1k(<4 x i16> %arg0, <4 x i16> %ar
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x4bf16_1k:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1153,10 +1153,10 @@ define void @test_rewrite_mfma_f32_32x32x8bf16_1k(<4 x i16> %arg0, <4 x i16> %ar
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x8bf16_1k:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1190,8 +1190,8 @@ define void @test_rewrite_mfma_f64_16x16x4f64(double %arg0, double %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f64_16x16x4f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
; CHECK-NEXT: ;;#ASMSTART
@@ -1246,10 +1246,10 @@ define void @test_rewrite_mfma_i32_32x32x16_i8(i64 %arg0, i64 %arg1, ptr addrspa
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x16_i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1283,10 +1283,10 @@ define void @test_rewrite_mfma_f32_32x32x4_xf32(<2 x float> %arg0, <2 x float> %
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4_xf32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1371,10 +1371,10 @@ define void @test_rewrite_mfma_f32_32x32x16_bf8_bf8(i64 %arg0, i64 %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_bf8_bf8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1391,10 +1391,10 @@ define void @test_rewrite_mfma_f32_32x32x16_bf8_fp8(i64 %arg0, i64 %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_bf8_fp8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1411,10 +1411,10 @@ define void @test_rewrite_mfma_f32_32x32x16_fp8_bf8(i64 %arg0, i64 %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_fp8_bf8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1431,10 +1431,10 @@ define void @test_rewrite_mfma_f32_32x32x16_fp8_fp8(i64 %arg0, i64 %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_fp8_fp8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1472,10 +1472,10 @@ define void @test_rewrite_smfmac_f32_32x32x16_f16(<4 x half> %arg0, <8 x half> %
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1513,10 +1513,10 @@ define void @test_rewrite_smfmac_f32_32x32x16_bf16(<4 x i16> %arg0, <8 x i16> %a
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1554,10 +1554,10 @@ define void @test_rewrite_smfmac_i32_32x32x32_i8(<2 x i32> %arg0, <4 x i32> %arg
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1652,10 +1652,10 @@ define void @test_rewrite_smfmac_32x32x32_bf8_bf8(<2 x i32> %arg0, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1674,10 +1674,10 @@ define void @test_rewrite_smfmac_32x32x32_bf8_fp8(<2 x i32> %arg0, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1696,10 +1696,10 @@ define void @test_rewrite_smfmac_32x32x32_fp8_bf8(<2 x i32> %arg0, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1718,10 +1718,10 @@ define void @test_rewrite_smfmac_32x32x32_fp8_fp8(<2 x i32> %arg0, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 02c70fbd970f8..8d68e71d08254 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -343,13 +343,13 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3
-; GFX9-NEXT: v_sub_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1
-; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0
+; GFX9-NEXT: v_sub_u32_e32 v3, v3, v7
+; GFX9-NEXT: v_sub_u32_e32 v2, v2, v6
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5
+; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -359,13 +359,13 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16
-; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3
-; GFX12-NEXT: v_sub_nc_u32_e32 v2, v6, v2
-; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1
-; GFX12-NEXT: v_sub_nc_u32_e32 v0, v4, v0
+; GFX12-NEXT: v_sub_nc_u32_e32 v3, v3, v7
+; GFX12-NEXT: v_sub_nc_u32_e32 v2, v2, v6
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v5
+; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v4
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
%b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 20789232b1f25..5d06acbde06cb 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -218,20 +218,19 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:16
; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:16
; GFX942-NEXT: .LBB5_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:16
-; GFX942-NEXT: s_waitcnt vmcnt(1)
; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
@@ -260,68 +259,64 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:224
; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:176
; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:96
; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:48
; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:16
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v0, s[2:3] offset:224
; GFX942-NEXT: global_load_dwordx4 v[30:33], v0, s[2:3] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[26:29], v0, s[2:3] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[22:25], v0, s[2:3] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[18:21], v0, s[2:3] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[14:17], v0, s[2:3] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[10:13], v0, s[2:3] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v0, s[2:3] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v0, s[2:3] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v0, s[2:3] offset:176
; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3] offset:128
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v0, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v0, s[2:3] offset:96
; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[58:61], v0, s[2:3] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[54:57], v0, s[2:3] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[50:53], v0, s[2:3] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[46:49], v0, s[2:3] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[42:45], v0, s[2:3] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[38:41], v0, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v0, s[2:3] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v0, s[2:3] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v0, s[2:3] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v0, s[2:3] offset:48
; GFX942-NEXT: global_load_dwordx4 v[34:37], v0, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v0, s[2:3] offset:16
; GFX942-NEXT: .LBB6_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(7)
+; GFX942-NEXT: s_waitcnt vmcnt(6)
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] offset:112
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[58:61], s[6:7] offset:96
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[54:57], s[6:7] offset:80
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[50:53], s[6:7] offset:64
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[46:49], s[6:7] offset:48
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[42:45], s[6:7] offset:32
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[38:41], s[6:7] offset:16
-; GFX942-NEXT: s_waitcnt vmcnt(7)
+; GFX942-NEXT: global_store_dwordx4 v1, v[54:57], s[6:7] offset:96
+; GFX942-NEXT: s_waitcnt vmcnt(6)
+; GFX942-NEXT: global_store_dwordx4 v1, v[58:61], s[6:7] offset:80
+; GFX942-NEXT: global_store_dwordx4 v1, v[46:49], s[6:7] offset:64
+; GFX942-NEXT: s_waitcnt vmcnt(6)
+; GFX942-NEXT: global_store_dwordx4 v1, v[50:53], s[6:7] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, v[38:41], s[6:7] offset:32
+; GFX942-NEXT: s_waitcnt vmcnt(6)
+; GFX942-NEXT: global_store_dwordx4 v1, v[42:45], s[6:7] offset:16
; GFX942-NEXT: global_store_dwordx4 v1, v[34:37], s[6:7]
; GFX942-NEXT: global_store_dwordx4 v1, v[30:33], s[6:7] offset:240
-; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[6:7] offset:224
-; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[6:7] offset:208
-; GFX942-NEXT: global_store_dwordx4 v1, v[18:21], s[6:7] offset:192
-; GFX942-NEXT: global_store_dwordx4 v1, v[14:17], s[6:7] offset:176
-; GFX942-NEXT: global_store_dwordx4 v1, v[10:13], s[6:7] offset:160
-; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:144
+; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[6:7] offset:224
+; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[6:7] offset:208
+; GFX942-NEXT: global_store_dwordx4 v1, v[14:17], s[6:7] offset:192
+; GFX942-NEXT: global_store_dwordx4 v1, v[18:21], s[6:7] offset:176
+; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:160
+; GFX942-NEXT: global_store_dwordx4 v1, v[10:13], s[6:7] offset:144
; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7] offset:128
; GFX942-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
index 6345011e3d9ce..cb3f00b4f3ace 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
@@ -70,8 +70,8 @@ define i32 @va1(ptr %fmt, ...) {
; RV64-NEXT: addi a0, sp, 8
; RV64-NEXT: addi a1, sp, 24
; RV64-NEXT: sd a1, 8(sp)
-; RV64-NEXT: lw a0, 4(a0)
; RV64-NEXT: lwu a1, 8(sp)
+; RV64-NEXT: lw a0, 4(a0)
; RV64-NEXT: sd a5, 56(sp)
; RV64-NEXT: sd a6, 64(sp)
; RV64-NEXT: sd a7, 72(sp)
@@ -135,8 +135,8 @@ define i32 @va1(ptr %fmt, ...) {
; RV64-WITHFP-NEXT: addi a0, s0, -24
; RV64-WITHFP-NEXT: addi a1, s0, 8
; RV64-WITHFP-NEXT: sd a1, -24(s0)
-; RV64-WITHFP-NEXT: lw a0, 4(a0)
; RV64-WITHFP-NEXT: lwu a1, -24(s0)
+; RV64-WITHFP-NEXT: lw a0, 4(a0)
; RV64-WITHFP-NEXT: sd a5, 40(s0)
; RV64-WITHFP-NEXT: sd a6, 48(s0)
; RV64-WITHFP-NEXT: sd a7, 56(s0)
@@ -1633,8 +1633,8 @@ define i32 @va_large_stack(ptr %fmt, ...) {
; RV64-NEXT: addi a1, a1, 280
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: sd a1, 8(sp)
-; RV64-NEXT: lw a0, 4(a0)
; RV64-NEXT: lwu a1, 8(sp)
+; RV64-NEXT: lw a0, 4(a0)
; RV64-NEXT: lui a2, 24414
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: sd a5, 312(a2)
More information about the llvm-commits mailing list