[llvm] [AMDGPU] Enable reordering of VMEM loads during clustering (PR #107986)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 7 22:29:18 PST 2025
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/107986
>From 5cea97d6d48a012acc528ac61510eca292e33d05 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 8 Dec 2025 10:18:24 +0900
Subject: [PATCH 1/2] [AMDGPU] Enable reordering of VMEM loads during
clustering
Add fine-grained control over ReorderWhileClustering by adding a
canReorderClusterMemOps query to TargetInstrInfo.
Implement this to return true for RISC-V to maintain current behaviour.
On AMDGPU enable ReorderWhileClustering for loads and implement
canReorderClusterMemOps to reject reordering for operations other
than VMEM.
The intention of doing this is to allow some additional overlap
of computation with memory loads: as loads will be issued in
an order closer to their usage, more incremental s_waitcnt can
be introduced.
On average this yields a very small reduction in VGPR pressure,
although edge cases may see increased pressure.
Reordering SMEM/LDS access is not beneficial as these must always
be waitcnt 0.
For the benefit of future tuning, add support for the function
attribute "amdgpu-reorder-loads-while-clustering" to disable/enable
reordering behaviour per function.
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 10 +++++++
llvm/lib/CodeGen/MachineScheduler.cpp | 5 ++--
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 28 +++++++++++++++----
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 ++++++++++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +++
llvm/lib/Target/RISCV/RISCVInstrInfo.h | 6 ++++
6 files changed, 60 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 2a9522452d7cd..6b21488867776 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1668,6 +1668,16 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
llvm_unreachable("target did not implement shouldClusterMemOps()");
}
+ /// Returns true if the two given memory operations can be reordered
+ /// while clustering.
+ /// Will only be queried if ReorderWhileClustering is enabled and
+ /// shouldClusterMemOps already returned true for the same operation pair.
+ virtual bool
+ canReorderClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2) const {
+ llvm_unreachable("target did not implement canReorderClusterMemOps()");
+ }
+
/// Reverses the branch condition of the specified condition list,
/// returning false on success and true if it cannot be reversed.
virtual bool
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index de29a9fab876e..ae3977044d206 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -2117,8 +2117,9 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
-
- if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
+ if (SUa->NodeNum > SUb->NodeNum &&
+ (!ReorderWhileClustering ||
+ !TII->canReorderClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps)))
std::swap(SUa, SUb);
// FIXME: Is this check really required?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e5a35abe6da6b..efd4e86516bd4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -537,6 +537,11 @@ static cl::opt<bool> EnableUniformIntrinsicCombine(
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> ReorderWhileLoadClustering(
+ "amdgpu-reorder-while-load-clustering",
+ cl::desc("Enable reordering during load clustering"), cl::init(true),
+ cl::Hidden);
+
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -636,12 +641,21 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
return new SIScheduleDAGMI(C);
}
+static bool getReorderWhileLoadClustering(const MachineFunction *MF) {
+ if (!ReorderWhileLoadClustering)
+ return false;
+ Attribute FnAttr =
+ MF->getFunction().getFnAttribute("amdgpu-reorder-loads-while-clustering");
+ return !FnAttr.isValid() || FnAttr.getValueAsBool();
+}
+
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
@@ -664,7 +678,8 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
@@ -677,7 +692,8 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto *DAG = new GCNIterativeScheduler(
C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
@@ -695,7 +711,8 @@ static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
@@ -794,7 +811,8 @@ llvm::ScheduleDAGInstrs *
AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = createSchedLive(C);
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createLoadClusterDAGMutation(
+ DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6d2110957002a..7604a2255df59 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -601,6 +601,20 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
return NumDWords <= MaxMemoryClusterDWords;
}
+bool SIInstrInfo::canReorderClusterMemOps(
+ ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2) const {
+ if (BaseOps1.empty() || BaseOps2.empty())
+ return false;
+
+ // Only reorder VMEMs/LDS.
+ // Assume caller has confirmed legality, e.g. aliasing.
+ const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
+ const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
+ return (isVMEM(FirstLdSt) && isVMEM(SecondLdSt)) ||
+ (isDS(FirstLdSt) && isDS(SecondLdSt));
+}
+
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into 2 16 store batches.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b1d6563bf3c0b..c951c5dc908c0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -276,6 +276,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned ClusterSize,
unsigned NumBytes) const override;
+ bool canReorderClusterMemOps(
+ ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2) const override;
+
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
int64_t Offset1, unsigned NumLoads) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 0ffe015b9fac8..c3772811f0371 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -204,6 +204,12 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
unsigned ClusterSize,
unsigned NumBytes) const override;
+ bool canReorderClusterMemOps(
+ ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2) const override {
+ return true;
+ }
+
bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
const MachineOperand *&BaseOp,
int64_t &Offset, LocationSize &Width,
>From 6d6fd9c8b31fd6133cac1da8673f07b4d2f78753 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 8 Dec 2025 15:26:01 +0900
Subject: [PATCH 2/2] - Test changes
---
llvm/test/CodeGen/AMDGPU/bf16.ll | 583 +--
.../CodeGen/AMDGPU/call-argument-types.ll | 33 +-
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 148 +-
llvm/test/CodeGen/AMDGPU/ds_read2.ll | 128 +-
.../fast-unaligned-load-store.global.ll | 6 +-
.../fast-unaligned-load-store.private.ll | 6 +-
.../flat_atomics_i64_system_noprivate.ll | 368 +-
llvm/test/CodeGen/AMDGPU/fma-combine.ll | 12 +-
llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll | 122 +-
llvm/test/CodeGen/AMDGPU/freeze.ll | 600 ++-
llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll | 2 +-
llvm/test/CodeGen/AMDGPU/idot4u.ll | 18 +-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 119 +-
.../insert_waitcnt_for_precise_memory.ll | 4 +-
...e92561-restore-undef-scc-verifier-error.ll | 56 +-
.../test/CodeGen/AMDGPU/lds-misaligned-bug.ll | 13 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 40 +-
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 28 +-
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 82 +-
.../AMDGPU/llvm.amdgcn.writelane.ptr.ll | 6 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 48 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 479 ++-
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 862 ++--
llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 1341 +++----
.../AMDGPU/load-local-redundant-copies.ll | 85 +-
llvm/test/CodeGen/AMDGPU/load-local.128.ll | 149 +-
llvm/test/CodeGen/AMDGPU/load-local.96.ll | 105 +-
...ne-sink-temporal-divergence-swdev407790.ll | 8 +-
llvm/test/CodeGen/AMDGPU/madak.ll | 192 +-
llvm/test/CodeGen/AMDGPU/max.i16.ll | 8 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 88 +-
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 3470 +++++++++--------
.../AMDGPU/memmove-param-combinations.ll | 178 +-
llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 48 +-
.../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 196 +-
llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll | 24 +-
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 315 +-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 34 +-
llvm/test/CodeGen/AMDGPU/pr51516.mir | 3 -
.../CodeGen/AMDGPU/private-memory-atomics.ll | 2 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 287 +-
.../AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll | 24 +-
.../rewrite-vgpr-mfma-to-agpr.gfx90a.ll | 20 +-
.../rewrite-vgpr-mfma-to-agpr.gfx950.ll | 64 +-
.../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 192 +-
llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 88 +-
.../AMDGPU/splitkit-getsubrangeformask.ll | 6 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 72 +-
llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 60 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 91 +-
llvm/test/CodeGen/AMDGPU/wqm.ll | 14 +-
51 files changed, 5418 insertions(+), 5509 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 6b0424e50983e..415a9612a4623 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -9548,27 +9548,20 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1
-; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 16, v1
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 50, v1
+; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, 52, v1
; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, 54, v1
; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1
-; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, 56, v1
; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1
+; GFX8-NEXT: v_add_u32_e32 v29, vcc, 58, v1
; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, 60, v1
; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1
-; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
-; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
@@ -9580,78 +9573,84 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT: flat_load_ushort v44, v[1:2]
-; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
+; GFX8-NEXT: v_add_u32_e32 v33, vcc, 62, v1
+; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_ushort v26, v[19:20]
+; GFX8-NEXT: flat_load_ushort v44, v[21:22]
+; GFX8-NEXT: flat_load_ushort v45, v[23:24]
+; GFX8-NEXT: flat_load_ushort v46, v[27:28]
+; GFX8-NEXT: flat_load_ushort v47, v[29:30]
+; GFX8-NEXT: flat_load_ushort v56, v[31:32]
+; GFX8-NEXT: flat_load_ushort v57, v[33:34]
+; GFX8-NEXT: flat_load_ushort v58, v[1:2]
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, 18, v1
+; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v30, vcc, 20, v1
+; GFX8-NEXT: v_addc_u32_e32 v31, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, 22, v1
+; GFX8-NEXT: v_addc_u32_e32 v33, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v34, vcc, 24, v1
+; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, 26, v1
+; GFX8-NEXT: v_addc_u32_e32 v37, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v38, vcc, 28, v1
+; GFX8-NEXT: v_addc_u32_e32 v39, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v48, vcc, 30, v1
; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1
-; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v45, v[50:51]
-; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1
-; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v46, v[50:51]
-; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 34, v1
+; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, 36, v1
+; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, 38, v1
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1
+; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v47, v[52:53]
-; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1
-; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1
-; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v56, v[54:55]
-; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1
+; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1
-; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v57, v[39:40]
-; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
-; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v58, v[39:40]
-; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, 46, v1
; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1
+; GFX8-NEXT: v_add_u32_e32 v42, vcc, 48, v1
; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v42, v[42:43]
-; GFX8-NEXT: flat_load_ushort v34, v[33:34]
-; GFX8-NEXT: flat_load_ushort v36, v[35:36]
-; GFX8-NEXT: flat_load_ushort v38, v[37:38]
-; GFX8-NEXT: flat_load_ushort v39, v[48:49]
-; GFX8-NEXT: flat_load_ushort v48, v[50:51]
-; GFX8-NEXT: flat_load_ushort v51, v[52:53]
+; GFX8-NEXT: flat_load_ushort v20, v[19:20]
+; GFX8-NEXT: flat_load_ushort v22, v[21:22]
+; GFX8-NEXT: flat_load_ushort v24, v[23:24]
+; GFX8-NEXT: flat_load_ushort v25, v[50:51]
+; GFX8-NEXT: flat_load_ushort v27, v[52:53]
; GFX8-NEXT: flat_load_ushort v52, v[54:55]
; GFX8-NEXT: flat_load_ushort v53, v[40:41]
-; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1
-; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v37, v[3:4]
-; GFX8-NEXT: flat_load_ushort v35, v[5:6]
-; GFX8-NEXT: flat_load_ushort v33, v[7:8]
+; GFX8-NEXT: flat_load_ushort v54, v[42:43]
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, 32, v1
+; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_ushort v23, v[3:4]
+; GFX8-NEXT: flat_load_ushort v21, v[5:6]
+; GFX8-NEXT: flat_load_ushort v19, v[7:8]
; GFX8-NEXT: flat_load_ushort v8, v[9:10]
; GFX8-NEXT: flat_load_ushort v6, v[11:12]
; GFX8-NEXT: flat_load_ushort v4, v[13:14]
; GFX8-NEXT: flat_load_ushort v2, v[15:16]
-; GFX8-NEXT: flat_load_ushort v1, v[19:20]
+; GFX8-NEXT: flat_load_ushort v1, v[17:18]
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0
; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v58
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
-; GFX8-NEXT: flat_load_ushort v3, v[17:18]
-; GFX8-NEXT: flat_load_ushort v5, v[21:22]
-; GFX8-NEXT: flat_load_ushort v7, v[23:24]
-; GFX8-NEXT: flat_load_ushort v9, v[25:26]
-; GFX8-NEXT: flat_load_ushort v10, v[27:28]
-; GFX8-NEXT: flat_load_ushort v11, v[29:30]
-; GFX8-NEXT: flat_load_ushort v12, v[31:32]
-; GFX8-NEXT: flat_load_ushort v13, v[49:50]
-; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0
+; GFX8-NEXT: flat_load_ushort v3, v[28:29]
+; GFX8-NEXT: flat_load_ushort v5, v[30:31]
+; GFX8-NEXT: flat_load_ushort v7, v[32:33]
+; GFX8-NEXT: flat_load_ushort v9, v[34:35]
+; GFX8-NEXT: flat_load_ushort v10, v[36:37]
+; GFX8-NEXT: flat_load_ushort v11, v[38:39]
+; GFX8-NEXT: flat_load_ushort v12, v[48:49]
+; GFX8-NEXT: flat_load_ushort v13, v[50:51]
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v57
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v56
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
@@ -9663,68 +9662,68 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v46
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v45
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v44
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v26
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0
; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v54
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v53
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v52
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v27
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v25
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v24
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v22
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
@@ -9732,17 +9731,17 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v20
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v23
; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v21
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
@@ -9750,16 +9749,15 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0
; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0
; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0
; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen
@@ -9767,7 +9765,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10
; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0
; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen
@@ -9775,7 +9773,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9
; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0
; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen
@@ -9783,6 +9781,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0
; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen
@@ -9791,6 +9790,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -10045,135 +10045,137 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
; GFX950-NEXT: global_load_ushort v1, v[2:3], off offset:2
-; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12
-; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8
+; GFX950-NEXT: global_load_ushort v4, v[2:3], off
+; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:6
; GFX950-NEXT: global_load_ushort v6, v[2:3], off offset:4
-; GFX950-NEXT: global_load_ushort v7, v[2:3], off
-; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:6
-; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:10
-; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:14
+; GFX950-NEXT: global_load_ushort v7, v[2:3], off offset:10
+; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:8
+; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:14
+; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:12
; GFX950-NEXT: global_load_ushort v11, v[2:3], off offset:18
-; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:28
-; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:24
+; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:16
+; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:22
; GFX950-NEXT: global_load_ushort v14, v[2:3], off offset:20
-; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:16
-; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:22
-; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:26
-; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:30
+; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:26
+; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:24
+; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:30
+; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:28
; GFX950-NEXT: global_load_ushort v19, v[2:3], off offset:34
-; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:44
-; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:40
+; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:32
+; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:38
; GFX950-NEXT: global_load_ushort v22, v[2:3], off offset:36
-; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:32
-; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:38
-; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:42
-; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:46
+; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:42
+; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:40
+; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:46
+; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:44
; GFX950-NEXT: global_load_ushort v42, v[2:3], off offset:50
; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:62
; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:60
-; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56
-; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52
-; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48
-; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54
-; GFX950-NEXT: global_load_ushort v58, v[2:3], off offset:58
+; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:48
+; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:54
+; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:58
+; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:56
+; GFX950-NEXT: global_load_ushort v61, v[2:3], off offset:52
; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX950-NEXT: s_waitcnt vmcnt(31)
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX950-NEXT: s_waitcnt vmcnt(30)
-; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX950-NEXT: s_waitcnt vmcnt(29)
-; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v5
; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
; GFX950-NEXT: s_waitcnt vmcnt(27)
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v7
; GFX950-NEXT: s_waitcnt vmcnt(26)
-; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v8
; GFX950-NEXT: s_waitcnt vmcnt(25)
-; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v9
; GFX950-NEXT: s_waitcnt vmcnt(24)
-; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v10
; GFX950-NEXT: s_waitcnt vmcnt(23)
-; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v11
; GFX950-NEXT: s_waitcnt vmcnt(22)
-; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX950-NEXT: s_waitcnt vmcnt(21)
-; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v13
; GFX950-NEXT: s_waitcnt vmcnt(20)
-; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v14
; GFX950-NEXT: s_waitcnt vmcnt(19)
-; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v15
; GFX950-NEXT: s_waitcnt vmcnt(18)
-; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v16
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v16
+; GFX950-NEXT: s_waitcnt vmcnt(17)
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v17
; GFX950-NEXT: s_waitcnt vmcnt(16)
-; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v18
; GFX950-NEXT: s_waitcnt vmcnt(15)
-; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v19
; GFX950-NEXT: s_waitcnt vmcnt(14)
-; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v20
; GFX950-NEXT: s_waitcnt vmcnt(13)
-; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v21
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v30
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v31
-; GFX950-NEXT: s_waitcnt vmcnt(10)
-; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v24
-; GFX950-NEXT: s_waitcnt vmcnt(9)
-; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v21
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v30
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v31
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v32
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v33
; GFX950-NEXT: s_waitcnt vmcnt(8)
-; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v26
; GFX950-NEXT: s_waitcnt vmcnt(7)
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v42
; GFX950-NEXT: s_waitcnt vmcnt(6)
; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v32
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v33
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v36
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v37
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v38
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v39
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v38
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v39
; GFX950-NEXT: v_cvt_f64_f32_e32 v[38:39], v44
; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v42
; GFX950-NEXT: s_waitcnt vmcnt(5)
; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v46
; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v42
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v58
+; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v57
; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:240
; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v46
-; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v47
+; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v60
; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v47
; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v57
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56
; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v46
-; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v17
-; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v23
; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v25
; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:224
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v27
; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v34
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v35
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v48
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[50:51], v49
+; GFX950-NEXT: s_waitcnt vmcnt(2)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v61
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v28
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v29
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v34
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v35
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v36
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v37
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v48
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v49
; GFX950-NEXT: v_cvt_f64_f32_e32 v[48:49], v52
; GFX950-NEXT: v_cvt_f64_f32_e32 v[54:55], v53
; GFX950-NEXT: v_cvt_f64_f32_e32 v[52:53], v40
; GFX950-NEXT: v_cvt_f64_f32_e32 v[40:41], v41
; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v1
; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
; GFX950-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[50:51], v50
; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:208
; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:192
; GFX950-NEXT: scratch_store_dwordx4 v0, v[38:41], off offset:176
@@ -10188,6 +10190,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32
; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16
; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off
+; GFX950-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
@@ -10402,130 +10405,130 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:2
-; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:12
-; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:8
+; GFX11-NEXT: global_load_u16 v4, v[1:2], off
+; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:6
; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:4
-; GFX11-NEXT: global_load_u16 v7, v[1:2], off
-; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6
-; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10
-; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14
+; GFX11-NEXT: global_load_u16 v7, v[1:2], off offset:10
+; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:8
+; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:14
+; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:12
; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:18
-; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:28
-; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:24
+; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:16
+; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:22
; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:20
-; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16
-; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22
-; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26
-; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30
+; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:26
+; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:24
+; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:30
+; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:28
; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:34
-; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:44
-; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:40
+; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:32
+; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:38
; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:36
-; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32
-; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38
-; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42
-; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46
+; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:42
+; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:40
+; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:46
+; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:44
; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:50
-; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:60
-; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:56
+; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:48
+; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:54
; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:52
-; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48
-; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
-; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
-; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
+; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:58
+; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:56
+; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:62
+; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:60
; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(30)
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v4
; GFX11-NEXT: s_waitcnt vmcnt(29)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: s_waitcnt vmcnt(28)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: s_waitcnt vmcnt(27)
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v7
; GFX11-NEXT: s_waitcnt vmcnt(26)
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v8
; GFX11-NEXT: s_waitcnt vmcnt(25)
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-NEXT: s_waitcnt vmcnt(24)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: s_waitcnt vmcnt(23)
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v11
; GFX11-NEXT: s_waitcnt vmcnt(22)
-; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v12
; GFX11-NEXT: s_waitcnt vmcnt(21)
; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11-NEXT: s_waitcnt vmcnt(20)
; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11-NEXT: s_waitcnt vmcnt(19)
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v15
; GFX11-NEXT: s_waitcnt vmcnt(18)
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v16
; GFX11-NEXT: s_waitcnt vmcnt(17)
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11-NEXT: s_waitcnt vmcnt(16)
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX11-NEXT: s_waitcnt vmcnt(15)
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v19
; GFX11-NEXT: s_waitcnt vmcnt(14)
-; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v20
; GFX11-NEXT: s_waitcnt vmcnt(13)
; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11-NEXT: s_waitcnt vmcnt(12)
; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-NEXT: s_waitcnt vmcnt(11)
-; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v23
; GFX11-NEXT: s_waitcnt vmcnt(10)
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v24
; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v27
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v28
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v31
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v32
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v65
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v29
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v29
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v65
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v68
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v33
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v1
; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v30
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v52
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v49
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v21
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v53
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v64
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v25
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v26
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v49
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v52
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v35
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v36
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v48
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v21
; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v22
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v34
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v35
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v36
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v34
; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v101
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v100
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v13
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v17
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v18
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v100
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v13
; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v14
; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v39
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v38
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v5
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v9
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v10
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v38
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v6
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
@@ -10555,97 +10558,95 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: s_clause 0x1f
; GFX1250-NEXT: global_load_u16 v1, v[2:3], off offset:2
-; GFX1250-NEXT: global_load_u16 v4, v[2:3], off offset:12
-; GFX1250-NEXT: global_load_u16 v5, v[2:3], off offset:8
+; GFX1250-NEXT: global_load_u16 v4, v[2:3], off
+; GFX1250-NEXT: global_load_u16 v5, v[2:3], off offset:6
; GFX1250-NEXT: global_load_u16 v6, v[2:3], off offset:4
-; GFX1250-NEXT: global_load_u16 v7, v[2:3], off
-; GFX1250-NEXT: global_load_u16 v8, v[2:3], off offset:6
-; GFX1250-NEXT: global_load_u16 v9, v[2:3], off offset:10
-; GFX1250-NEXT: global_load_u16 v10, v[2:3], off offset:14
-; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:18
-; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:62
-; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:60
+; GFX1250-NEXT: global_load_u16 v7, v[2:3], off offset:10
+; GFX1250-NEXT: global_load_u16 v8, v[2:3], off offset:8
+; GFX1250-NEXT: global_load_u16 v9, v[2:3], off offset:14
+; GFX1250-NEXT: global_load_u16 v10, v[2:3], off offset:12
+; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:62
+; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:60
+; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:18
; GFX1250-NEXT: global_load_u16 v14, v[2:3], off offset:58
; GFX1250-NEXT: global_load_u16 v15, v[2:3], off offset:56
-; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:28
-; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:24
+; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:16
+; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:22
; GFX1250-NEXT: global_load_u16 v18, v[2:3], off offset:20
-; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:16
-; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:22
-; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:26
-; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:30
+; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:26
+; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:24
+; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:30
+; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:28
; GFX1250-NEXT: global_load_u16 v23, v[2:3], off offset:34
-; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:44
-; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:40
+; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:32
+; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:38
; GFX1250-NEXT: global_load_u16 v26, v[2:3], off offset:36
-; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:32
-; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:38
-; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:42
-; GFX1250-NEXT: global_load_u16 v30, v[2:3], off offset:46
+; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:42
+; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:40
+; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:46
+; GFX1250-NEXT: global_load_u16 v30, v[2:3], off offset:44
; GFX1250-NEXT: global_load_u16 v31, v[2:3], off offset:50
-; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:52
-; GFX1250-NEXT: global_load_u16 v33, v[2:3], off offset:48
-; GFX1250-NEXT: global_load_u16 v34, v[2:3], off offset:54
+; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:48
+; GFX1250-NEXT: global_load_u16 v33, v[2:3], off offset:54
+; GFX1250-NEXT: global_load_u16 v34, v[2:3], off offset:52
; GFX1250-NEXT: s_wait_loadcnt 0x1e
-; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v37, 16, v4
+; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v84, 16, v4
; GFX1250-NEXT: s_wait_loadcnt 0x1c
-; GFX1250-NEXT: v_dual_lshlrev_b32 v81, 16, v5 :: v_dual_lshlrev_b32 v85, 16, v6
+; GFX1250-NEXT: v_dual_lshlrev_b32 v35, 16, v5 :: v_dual_lshlrev_b32 v85, 16, v6
; GFX1250-NEXT: s_wait_loadcnt 0x1a
-; GFX1250-NEXT: v_dual_lshlrev_b32 v84, 16, v7 :: v_dual_lshlrev_b32 v35, 16, v8
+; GFX1250-NEXT: v_dual_lshlrev_b32 v80, 16, v7 :: v_dual_lshlrev_b32 v81, 16, v8
; GFX1250-NEXT: s_wait_loadcnt 0x18
-; GFX1250-NEXT: v_dual_lshlrev_b32 v80, 16, v9 :: v_dual_lshlrev_b32 v36, 16, v10
-; GFX1250-NEXT: s_wait_loadcnt 0x15
+; GFX1250-NEXT: v_dual_lshlrev_b32 v36, 16, v9 :: v_dual_lshlrev_b32 v37, 16, v10
+; GFX1250-NEXT: s_wait_loadcnt 0x16
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v12 :: v_dual_lshlrev_b32 v3, 16, v13
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v11 :: v_dual_lshlrev_b32 v3, 16, v12
; GFX1250-NEXT: s_wait_loadcnt 0x14
-; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v11 :: v_dual_lshlrev_b32 v6, 16, v14
-; GFX1250-NEXT: s_wait_loadcnt 0x13
-; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v13 :: v_dual_lshlrev_b32 v6, 16, v14
+; GFX1250-NEXT: s_wait_loadcnt 0x12
+; GFX1250-NEXT: v_dual_lshlrev_b32 v7, 16, v15 :: v_dual_lshlrev_b32 v13, 16, v16
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v2
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
-; GFX1250-NEXT: s_wait_loadcnt 0x11
-; GFX1250-NEXT: v_dual_lshlrev_b32 v68, 16, v17 :: v_dual_lshlrev_b32 v39, 16, v16
-; GFX1250-NEXT: s_wait_loadcnt 0xe
-; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
-; GFX1250-NEXT: s_wait_loadcnt 0xc
-; GFX1250-NEXT: v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v38, 16, v22
-; GFX1250-NEXT: s_wait_loadcnt 0x9
-; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v25, 16, v25
-; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX1250-NEXT: s_wait_loadcnt 0x5
-; GFX1250-NEXT: v_dual_lshlrev_b32 v49, 16, v28 :: v_dual_lshlrev_b32 v64, 16, v29
-; GFX1250-NEXT: s_wait_loadcnt 0x3
-; GFX1250-NEXT: v_dual_lshlrev_b32 v50, 16, v30 :: v_dual_lshlrev_b32 v51, 16, v31
+; GFX1250-NEXT: s_wait_loadcnt 0x10
+; GFX1250-NEXT: v_dual_lshlrev_b32 v38, 16, v17 :: v_dual_lshlrev_b32 v82, 16, v18
+; GFX1250-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v24, 16, v24
+; GFX1250-NEXT: v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v39, 16, v22
+; GFX1250-NEXT: s_wait_loadcnt 0x6
+; GFX1250-NEXT: v_dual_lshlrev_b32 v64, 16, v27 :: v_dual_lshlrev_b32 v65, 16, v28
+; GFX1250-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NEXT: v_dual_lshlrev_b32 v49, 16, v29 :: v_dual_lshlrev_b32 v50, 16, v30
+; GFX1250-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NEXT: v_dual_lshlrev_b32 v51, 16, v31 :: v_dual_lshlrev_b32 v32, 16, v32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_dual_lshlrev_b32 v33, 16, v33 :: v_dual_lshlrev_b32 v52, 16, v34
-; GFX1250-NEXT: v_dual_lshlrev_b32 v32, 16, v32 :: v_dual_lshlrev_b32 v69, 16, v27
-; GFX1250-NEXT: v_lshlrev_b32_e32 v70, 16, v26
+; GFX1250-NEXT: v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v69, 16, v26
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v35
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v52
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v32
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v38
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v39
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v33
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v52
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[34:35], v48
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[38:39], v49
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[48:49], v33
-; GFX1250-NEXT: v_dual_lshlrev_b32 v13, 16, v19 :: v_dual_lshlrev_b32 v82, 16, v18
+; GFX1250-NEXT: v_dual_lshlrev_b32 v68, 16, v19 :: v_dual_lshlrev_b32 v20, 16, v20
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[66:67], v64
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[64:65], v25
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[64:65], v65
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v36
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v37
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v38
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v39
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[38:39], v25
; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:240
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v50
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v49
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v50
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[50:51], v51
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v24
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v36
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v37
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[36:37], v70
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[32:33], v69
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[70:71], v21
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[68:69], v68
-; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v20
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[48:49], v32
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[36:37], v69
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[32:33], v24
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v21
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[70:71], v68
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[68:69], v20
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v82
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v12
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v13
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index c407f7645315d..6e41496193d74 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -6932,51 +6932,52 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32
; VI-NEXT: s_setpc_b64 s[4:5]
;
; CI-LABEL: tail_call_byval_align16:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: buffer_store_dword v32, off, s[0:3], s32
; CI-NEXT: s_setpc_b64 s[4:5]
;
; SDAG-LABEL: tail_call_byval_align16:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32
; SDAG-NEXT: s_getpc_b64 s[4:5]
; SDAG-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; SDAG-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; SDAG-NEXT: s_waitcnt vmcnt(2)
-; SDAG-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32
; SDAG-NEXT: s_waitcnt vmcnt(1)
; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_dword v32, off, s[0:3], s32
; SDAG-NEXT: s_setpc_b64 s[4:5]
;
; GFX11-LABEL: tail_call_byval_align16:
@@ -6997,17 +6998,17 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; HSA: ; %bb.0: ; %entry
; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; HSA-NEXT: s_waitcnt vmcnt(1)
+; HSA-NEXT: s_waitcnt vmcnt(0)
; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; HSA-NEXT: s_waitcnt vmcnt(2)
-; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32
; HSA-NEXT: s_waitcnt vmcnt(1)
; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; HSA-NEXT: s_waitcnt vmcnt(1)
+; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32
; HSA-NEXT: s_setpc_b64 s[4:5]
;
; GISEL-LABEL: tail_call_byval_align16:
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 795f0841cede2..4d13cfcf2301b 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1352,26 +1352,26 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v1, v[0:1]
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[6:7]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[2:3]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
@@ -1540,24 +1540,24 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_ubyte v1, v0, s[12:13] offset:2
; GFX10-NEXT: global_load_ubyte v3, v0, s[12:13] offset:3
-; GFX10-NEXT: global_load_ubyte v2, v0, s[14:15] offset:3
-; GFX10-NEXT: global_load_ubyte v4, v0, s[14:15] offset:2
+; GFX10-NEXT: global_load_ubyte v2, v0, s[14:15] offset:2
+; GFX10-NEXT: global_load_ubyte v4, v0, s[14:15] offset:3
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshl_or_b32 v6, v2, 8, v4
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 8, v2
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405
-; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9]
-; GFX10-NEXT: global_store_dword v7, v4, s[10:11]
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x4000405
+; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[8:9]
+; GFX10-NEXT: global_store_dword v6, v4, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1568,18 +1568,18 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; GFX9-NEXT: s_mov_b32 s0, 0x4000405
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[12:13] offset:2
+; GFX9-NEXT: global_load_ubyte v2, v0, s[14:15] offset:2
; GFX9-NEXT: global_load_ubyte v3, v0, s[12:13] offset:3
-; GFX9-NEXT: global_load_ubyte v2, v0, s[14:15] offset:3
-; GFX9-NEXT: global_load_ubyte v4, v0, s[14:15] offset:2
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: global_load_ubyte v4, v0, s[14:15] offset:3
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v7, v2, 8, v4
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
+; GFX9-NEXT: v_lshl_or_b32 v4, v4, 8, v2
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_perm_b32 v4, v6, v7, s0
+; GFX9-NEXT: v_perm_b32 v4, v6, v4, s0
; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[8:9]
; GFX9-NEXT: global_store_dword v5, v4, s[10:11]
; GFX9-NEXT: s_endpgm
@@ -1595,14 +1595,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2
; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3
-; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3
-; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2
+; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:2
+; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:3
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 8, v0
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
+; GFX11-NEXT: v_lshl_or_b32 v5, v0, 8, v2
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
@@ -1898,37 +1898,39 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
+; VI-NEXT: v_add_u32_e32 v6, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 2, v0
+; VI-NEXT: v_add_u32_e32 v8, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0
+; VI-NEXT: v_add_u32_e32 v10, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v12, v[2:3]
-; VI-NEXT: flat_load_ubyte v2, v[8:9]
-; VI-NEXT: flat_load_ubyte v3, v[10:11]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
-; VI-NEXT: flat_load_ubyte v5, v[0:1]
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[12:13]
+; VI-NEXT: flat_load_ubyte v10, v[10:11]
+; VI-NEXT: flat_load_ubyte v8, v[8:9]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
-; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v7, v[0:1]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v7, v[2:3]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v10
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v8
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v12
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
@@ -1942,20 +1944,20 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x5
-; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
-; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
-; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
-; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
@@ -2003,20 +2005,20 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6
; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
-; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1
+; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1
+; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:6
; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
-; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
-; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
@@ -2439,26 +2441,26 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v1, v[0:1]
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[6:7]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[2:3]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9f1b55ea3b1ef..03977565086fb 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -529,30 +529,29 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
-; CI-NEXT: ds_read_u8 v2, v1 offset:1
-; CI-NEXT: ds_read_u8 v3, v1 offset:34
-; CI-NEXT: ds_read_u8 v4, v1 offset:32
-; CI-NEXT: ds_read_u8 v5, v1 offset:2
-; CI-NEXT: ds_read_u8 v6, v1
-; CI-NEXT: ds_read_u8 v7, v1 offset:3
-; CI-NEXT: ds_read_u8 v8, v1 offset:33
+; CI-NEXT: ds_read_u8 v2, v1
+; CI-NEXT: ds_read_u8 v3, v1 offset:1
+; CI-NEXT: ds_read_u8 v4, v1 offset:2
+; CI-NEXT: ds_read_u8 v5, v1 offset:3
+; CI-NEXT: ds_read_u8 v6, v1 offset:32
+; CI-NEXT: ds_read_u8 v7, v1 offset:33
+; CI-NEXT: ds_read_u8 v8, v1 offset:34
; CI-NEXT: ds_read_u8 v1, v1 offset:35
-; CI-NEXT: s_waitcnt lgkmcnt(7)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: s_waitcnt lgkmcnt(3)
-; CI-NEXT: v_or_b32_e32 v2, v2, v6
-; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v7
-; CI-NEXT: v_or_b32_e32 v5, v6, v5
-; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: s_waitcnt lgkmcnt(6)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: s_waitcnt lgkmcnt(4)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v5
+; CI-NEXT: v_or_b32_e32 v3, v3, v4
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT: v_or_b32_e32 v2, v5, v2
-; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; CI-NEXT: v_or_b32_e32 v1, v1, v3
-; CI-NEXT: v_or_b32_e32 v4, v5, v4
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; CI-NEXT: v_or_b32_e32 v1, v1, v8
+; CI-NEXT: v_or_b32_e32 v3, v3, v6
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_or_b32_e32 v1, v1, v4
+; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
@@ -622,30 +621,29 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
-; CI-NEXT: ds_read_u8 v2, v1 offset:6
-; CI-NEXT: ds_read_u8 v3, v1 offset:11
-; CI-NEXT: ds_read_u8 v4, v1 offset:9
-; CI-NEXT: ds_read_u8 v5, v1 offset:7
-; CI-NEXT: ds_read_u8 v6, v1 offset:5
-; CI-NEXT: ds_read_u8 v7, v1 offset:8
-; CI-NEXT: ds_read_u8 v8, v1 offset:10
+; CI-NEXT: ds_read_u8 v2, v1 offset:5
+; CI-NEXT: ds_read_u8 v3, v1 offset:6
+; CI-NEXT: ds_read_u8 v4, v1 offset:7
+; CI-NEXT: ds_read_u8 v5, v1 offset:8
+; CI-NEXT: ds_read_u8 v6, v1 offset:9
+; CI-NEXT: ds_read_u8 v7, v1 offset:10
+; CI-NEXT: ds_read_u8 v8, v1 offset:11
; CI-NEXT: ds_read_u8 v1, v1 offset:12
-; CI-NEXT: s_waitcnt lgkmcnt(7)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: s_waitcnt lgkmcnt(3)
-; CI-NEXT: v_or_b32_e32 v2, v2, v6
-; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v7
-; CI-NEXT: v_or_b32_e32 v5, v6, v5
-; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT: s_waitcnt lgkmcnt(6)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: s_waitcnt lgkmcnt(4)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v5
+; CI-NEXT: v_or_b32_e32 v3, v3, v4
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT: v_or_b32_e32 v2, v5, v2
-; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
-; CI-NEXT: v_or_b32_e32 v1, v1, v3
-; CI-NEXT: v_or_b32_e32 v4, v5, v4
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; CI-NEXT: v_or_b32_e32 v1, v1, v8
+; CI-NEXT: v_or_b32_e32 v3, v3, v6
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_or_b32_e32 v1, v1, v4
+; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
@@ -716,15 +714,15 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0
; CI-NEXT: ds_read_u16 v2, v1 offset:2
-; CI-NEXT: ds_read_u16 v3, v1 offset:32
-; CI-NEXT: ds_read_u16 v4, v1
-; CI-NEXT: ds_read_u16 v1, v1 offset:34
+; CI-NEXT: ds_read_u16 v3, v1
+; CI-NEXT: ds_read_u16 v4, v1 offset:34
+; CI-NEXT: ds_read_u16 v1, v1 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_or_b32_e32 v2, v2, v4
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_or_b32_e32 v1, v1, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; CI-NEXT: v_or_b32_e32 v1, v3, v1
; CI-NEXT: v_add_f32_e32 v2, v2, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -1453,32 +1451,28 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out)
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read_u8 v1, v0 offset:70
-; CI-NEXT: ds_read_u8 v2, v0 offset:72
-; CI-NEXT: ds_read_u8 v3, v0 offset:71
-; CI-NEXT: ds_read_u8 v4, v0 offset:69
+; CI-NEXT: ds_read_u8 v2, v0 offset:65
+; CI-NEXT: ds_read_u8 v3, v0 offset:66
+; CI-NEXT: ds_read_u8 v4, v0 offset:67
; CI-NEXT: ds_read_u8 v5, v0 offset:68
-; CI-NEXT: s_waitcnt lgkmcnt(4)
-; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT: s_waitcnt lgkmcnt(3)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: s_waitcnt lgkmcnt(1)
-; CI-NEXT: v_or_b32_e32 v1, v1, v4
-; CI-NEXT: ds_read_u8 v4, v0 offset:66
-; CI-NEXT: ds_read_u8 v6, v0 offset:67
-; CI-NEXT: ds_read_u8 v0, v0 offset:65
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: ds_read_u8 v1, v0 offset:69
+; CI-NEXT: ds_read_u8 v6, v0 offset:70
+; CI-NEXT: ds_read_u8 v7, v0 offset:71
+; CI-NEXT: ds_read_u8 v0, v0 offset:72
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_or_b32_e32 v1, v2, v1
+; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; CI-NEXT: v_or_b32_e32 v0, v2, v0
+; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; CI-NEXT: v_or_b32_e32 v1, v6, v1
+; CI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; CI-NEXT: v_or_b32_e32 v0, v0, v7
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v1, v0, v1
+; CI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
+; CI-NEXT: v_or_b32_e32 v0, v0, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; CI-NEXT: v_or_b32_e32 v2, v2, v6
+; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: v_or_b32_e32 v0, v2, v0
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 52bcaed7ec75a..b667cf591df05 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -154,15 +154,15 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v6, vcc, 3, v0
; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[4:5]
-; GFX7-ALIGNED-NEXT: flat_load_ubyte v5, v[6:7]
; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3]
+; GFX7-ALIGNED-NEXT: flat_load_ubyte v3, v[6:7]
; GFX7-ALIGNED-NEXT: flat_load_ubyte v0, v[0:1]
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v4
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v5
-; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 6f8da57e223e5..44493498bdbe9 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -205,15 +205,15 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
index f655d4761fa31..6e1543f10218c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
@@ -1099,8 +1099,8 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1125,8 +1125,8 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1180,8 +1180,8 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1208,8 +1208,8 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1260,10 +1260,10 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1288,10 +1288,10 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_sub_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1347,10 +1347,10 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1375,10 +1375,10 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1897,8 +1897,8 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1923,8 +1923,8 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1978,8 +1978,8 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2006,8 +2006,8 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2058,10 +2058,10 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_and_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2086,10 +2086,10 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_and_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2145,10 +2145,10 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2173,10 +2173,10 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2683,8 +2683,8 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2711,8 +2711,8 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2770,8 +2770,8 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2800,8 +2800,8 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2856,10 +2856,10 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_nand_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2886,10 +2886,10 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_nand_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2949,10 +2949,10 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2979,10 +2979,10 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3449,8 +3449,8 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3479,8 +3479,8 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3537,10 +3537,10 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3567,10 +3567,10 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3633,8 +3633,8 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3659,8 +3659,8 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3714,8 +3714,8 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3742,8 +3742,8 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3794,10 +3794,10 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_or_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3822,10 +3822,10 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_or_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3881,10 +3881,10 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3909,10 +3909,10 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4419,8 +4419,8 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4445,8 +4445,8 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4500,8 +4500,8 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4528,8 +4528,8 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4580,10 +4580,10 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4608,10 +4608,10 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_xor_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4667,10 +4667,10 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4695,10 +4695,10 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5205,8 +5205,8 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5232,8 +5232,8 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5289,8 +5289,8 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5318,8 +5318,8 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5372,10 +5372,10 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_max_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5401,10 +5401,10 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_max_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5462,10 +5462,10 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5491,10 +5491,10 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6477,8 +6477,8 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB94_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6504,8 +6504,8 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB94_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6561,8 +6561,8 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB95_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6590,8 +6590,8 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB95_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6644,10 +6644,10 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_umax_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB96_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6673,10 +6673,10 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_umax_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB96_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6734,10 +6734,10 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB97_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6763,10 +6763,10 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB97_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7647,8 +7647,8 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7674,8 +7674,8 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7731,8 +7731,8 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7760,8 +7760,8 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7814,10 +7814,10 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_umin_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7843,10 +7843,10 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_umin_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7904,10 +7904,10 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7933,10 +7933,10 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8481,8 +8481,8 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8508,8 +8508,8 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8565,8 +8565,8 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8594,8 +8594,8 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8648,10 +8648,10 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_min_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8677,10 +8677,10 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_min_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8738,10 +8738,10 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8767,10 +8767,10 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9746,8 +9746,8 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB131_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9775,8 +9775,8 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB131_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9836,8 +9836,8 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB132_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9867,8 +9867,8 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB132_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9925,10 +9925,10 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB133_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9956,10 +9956,10 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB133_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10021,10 +10021,10 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB134_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10052,10 +10052,10 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB134_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10604,8 +10604,8 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10635,8 +10635,8 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10700,8 +10700,8 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10733,8 +10733,8 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10795,10 +10795,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[4:5]
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10828,10 +10828,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[4:5]
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10897,10 +10897,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: flat_load_dword v1, v[6:7]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10930,10 +10930,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: flat_load_dword v1, v[6:7]
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 8fc6904f5009c..4f65825c4d8c5 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -2707,14 +2707,14 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3]
+; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3] offset:16
; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3
-; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2
-; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1
-; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0
+; GFX11-NEXT: v_fma_f32 v3, v11, -v3, -v7
+; GFX11-NEXT: v_fma_f32 v2, v10, -v2, -v6
+; GFX11-NEXT: v_fma_f32 v1, v9, -v1, -v5
+; GFX11-NEXT: v_fma_f32 v0, v8, -v0, -v4
; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
index af753812b4c0c..032e3e1e677b4 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
@@ -497,32 +497,32 @@ define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) {
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v6, s[4:5], 28, v0
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v7, s[4:5], 0, v1, s[4:5]
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v8, s[4:5], 44, v0
-; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v1, s[4:5]
-; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[10:13], v[8:9] offset:16
-; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:32
-; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:48
-; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:64
-; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:80
-; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[30:33], v[6:7]
+; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[10:13], v[0:1] offset:12
+; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:16
+; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:32
+; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:48
+; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:64
+; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[30:33], v[8:9] offset:80
+; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[34:37], v[6:7]
; GFX90A-SDAG-MUBUF-NEXT: s_nop 0
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[6:9], v[8:9]
+; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[34:37], v[0:1] offset:12
; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[48:51], v[4:5]
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x88, v2
; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:48
-; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:64
-; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[14:17] offset:64
-; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:32
+; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[10:13]
+; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:48
+; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:64
+; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[18:21] offset:64
+; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:32
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[6:9] offset:32
-; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:16
-; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[34:37]
+; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[14:17]
+; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[34:37] offset:16
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dword v[4:5], v50
; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128
; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -538,32 +538,32 @@ define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) {
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v6, s[0:1], 28, v0
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v8, s[0:1], 44, v0
-; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
-; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[10:13], v[8:9] offset:16
-; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:32
-; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:48
-; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:64
-; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:80
-; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[30:33], v[6:7]
+; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[10:13], v[0:1] offset:12
+; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:16
+; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:32
+; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:48
+; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:64
+; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[30:33], v[8:9] offset:80
+; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[34:37], v[6:7]
; GFX90A-SDAG-FLATSCR-NEXT: s_nop 0
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[6:9], v[8:9]
+; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[34:37], v[0:1] offset:12
; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[48:51], v[4:5]
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x88, v2
; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:48
-; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:64
-; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[14:17] offset:64
-; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:32
+; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[10:13]
+; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:48
+; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:64
+; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[18:21] offset:64
+; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:32
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[6:9] offset:32
-; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:16
-; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[34:37]
+; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[14:17]
+; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[34:37] offset:16
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dword v[4:5], v50
; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128
; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -579,13 +579,13 @@ define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) {
; GFX10-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo
; GFX10-SDAG-NEXT: s_clause 0x8
-; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[36:37] offset:80
-; GFX10-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[36:37] offset:96
-; GFX10-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[36:37] offset:48
-; GFX10-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[36:37] offset:64
-; GFX10-SDAG-NEXT: flat_load_dwordx4 v[20:23], v[36:37] offset:16
-; GFX10-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[36:37] offset:32
-; GFX10-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12
+; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
+; GFX10-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[36:37] offset:80
+; GFX10-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[36:37] offset:96
+; GFX10-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[36:37] offset:48
+; GFX10-SDAG-NEXT: flat_load_dwordx4 v[20:23], v[36:37] offset:64
+; GFX10-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[36:37] offset:16
+; GFX10-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[36:37] offset:32
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[32:35], v[36:37]
; GFX10-SDAG-NEXT: flat_load_dwordx4 v[36:39], v[36:37] offset:112
; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48
@@ -593,19 +593,19 @@ define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) {
; GFX10-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2
; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
-; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:48
+; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8)
-; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:64
+; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:48
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8)
-; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64
+; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:64
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8)
-; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:32
+; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:64
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8)
-; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32
+; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:32
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8)
-; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
+; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:32
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8)
-; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[28:31]
+; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8)
; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8)
@@ -628,20 +628,20 @@ define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) {
; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x6c
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[16:17], v[0:1], 0, s[2:3]
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 28
+; GFX942-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[0:1] offset:12
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, 60
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, 44
-; GFX942-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[16:17]
-; GFX942-SDAG-NEXT: flat_load_dwordx4 v[22:25], v[12:13]
-; GFX942-SDAG-NEXT: flat_load_dwordx4 v[26:29], v[14:15]
+; GFX942-SDAG-NEXT: flat_load_dwordx4 v[22:25], v[16:17]
+; GFX942-SDAG-NEXT: flat_load_dwordx4 v[26:29], v[12:13]
; GFX942-SDAG-NEXT: ; kill: killed $vgpr12_vgpr13
-; GFX942-SDAG-NEXT: ; kill: killed $vgpr14_vgpr15
; GFX942-SDAG-NEXT: ; kill: killed $vgpr16_vgpr17
; GFX942-SDAG-NEXT: s_nop 0
-; GFX942-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[8:9]
-; GFX942-SDAG-NEXT: flat_load_dwordx4 v[30:33], v[10:11]
-; GFX942-SDAG-NEXT: flat_load_dwordx4 v[34:37], v[4:5]
-; GFX942-SDAG-NEXT: flat_load_dwordx4 v[48:51], v[6:7]
-; GFX942-SDAG-NEXT: flat_load_dwordx4 v[52:55], v[0:1] offset:12
+; GFX942-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[14:15]
+; GFX942-SDAG-NEXT: s_nop 0
+; GFX942-SDAG-NEXT: flat_load_dwordx4 v[30:33], v[8:9]
+; GFX942-SDAG-NEXT: flat_load_dwordx4 v[34:37], v[10:11]
+; GFX942-SDAG-NEXT: flat_load_dwordx4 v[48:51], v[4:5]
+; GFX942-SDAG-NEXT: flat_load_dwordx4 v[52:55], v[6:7]
; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x8c
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX942-SDAG-NEXT: flat_load_dwordx4 a[0:3], v[0:1]
@@ -655,14 +655,14 @@ define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) {
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[2:3], 0, 48
; GFX942-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[2:3], 0, s[6:7]
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21]
-; GFX942-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[26:29]
-; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[22:25] offset:64
-; GFX942-SDAG-NEXT: flat_store_dwordx4 v[6:7], v[30:33]
-; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32
-; GFX942-SDAG-NEXT: flat_store_dwordx4 v[8:9], v[48:51]
-; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[52:55]
-; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[34:37] offset:16
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[18:21]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[22:25]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[26:29] offset:64
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[6:7], v[34:37]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:32
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[8:9], v[52:55]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[48:51] offset:16
; GFX942-SDAG-NEXT: flat_store_dword v[10:11], a2
; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], a[0:1] offset:128
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 7df250d1fc1b4..645a370a88425 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -2962,37 +2962,38 @@ define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX8-GISEL-LABEL: freeze_v19i32:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0
-; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 16, v0
; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0
; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
-; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
-; GFX8-GISEL-NEXT: flat_load_dwordx3 v[20:22], v[20:21]
+; GFX8-GISEL-NEXT: flat_load_dwordx3 v[20:22], v[0:1]
; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2
; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4)
-; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT: s_nop 0
+; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v2
; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 64, v2
+; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 48, v2
; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 64, v2
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4)
; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4)
-; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15]
+; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4)
-; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[16:19]
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4)
-; GFX8-GISEL-NEXT: flat_store_dwordx3 v[6:7], v[20:22]
+; GFX8-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[20:22]
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -10349,24 +10350,24 @@ define void @freeze_v8p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
; GFX6-SDAG-LABEL: freeze_v8p3:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 24, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 16, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 8, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 24, v0
; GFX6-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v2
+; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GFX6-SDAG-NEXT: ds_read_b64 v[4:5], v4
; GFX6-SDAG-NEXT: ds_read_b64 v[6:7], v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v0
-; GFX6-SDAG-NEXT: ds_read_b64 v[8:9], v0
+; GFX6-SDAG-NEXT: ds_read_b64 v[8:9], v8
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 16, v1
+; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(3)
+; GFX6-SDAG-NEXT: ds_write_b64 v1, v[2:3]
; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2)
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[4:5]
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[6:7]
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 24, v1
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[2:3]
-; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v1
-; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(3)
-; GFX6-SDAG-NEXT: ds_write_b64 v1, v[6:7]
-; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(3)
+; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2)
; GFX6-SDAG-NEXT: ds_write_b64 v0, v[8:9]
+; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v1
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[4:5]
; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -10502,40 +10503,39 @@ define void @freeze_v16p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
; GFX6-SDAG-LABEL: freeze_v16p3:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 24, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 16, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 40, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v16, vcc, 56, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 48, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 8, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v16, vcc, 16, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 40, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 56, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 48, v0
; GFX6-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v2
+; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v0
; GFX6-SDAG-NEXT: ds_read_b64 v[4:5], v4
; GFX6-SDAG-NEXT: ds_read_b64 v[6:7], v6
-; GFX6-SDAG-NEXT: ds_read_b64 v[8:9], v0
+; GFX6-SDAG-NEXT: ds_read_b64 v[8:9], v8
; GFX6-SDAG-NEXT: ds_read_b64 v[10:11], v10
; GFX6-SDAG-NEXT: ds_read_b64 v[12:13], v12
; GFX6-SDAG-NEXT: ds_read_b64 v[14:15], v14
; GFX6-SDAG-NEXT: ds_read_b64 v[16:17], v16
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 48, v1
-; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(4)
-; GFX6-SDAG-NEXT: ds_write_b64 v1, v[8:9]
-; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2)
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[14:15]
+; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(3)
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[10:11]
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 56, v1
-; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2)
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[16:17]
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[8:9]
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 32, v1
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[12:13]
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[6:7]
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 40, v1
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[10:11]
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[4:5]
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 16, v1
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[6:7]
+; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(4)
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[16:17]
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 24, v1
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[4:5]
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[14:15]
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v1
-; GFX6-SDAG-NEXT: ds_write_b64 v0, v[2:3]
+; GFX6-SDAG-NEXT: ds_write_b64 v1, v[2:3]
+; GFX6-SDAG-NEXT: ds_write_b64 v0, v[12:13]
; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -10794,75 +10794,47 @@ define void @freeze_p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
}
define void @freeze_v2p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
-; GFX6-SDAG-LABEL: freeze_v2p5:
-; GFX6-SDAG: ; %bb.0:
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 4, v0
-; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 4, v1
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX6-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX6-SDAG-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX6-GISEL-LABEL: freeze_v2p5:
-; GFX6-GISEL: ; %bb.0:
-; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
-; GFX6-GISEL-NEXT: v_add_i32_e32 v0, vcc, 4, v0
-; GFX6-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX6-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX6-GISEL-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX6-GISEL-NEXT: v_add_i32_e32 v1, vcc, 4, v1
-; GFX6-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX6-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-SDAG-LABEL: freeze_v2p5:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 4, v0
-; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 4, v1
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX7-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX7-SDAG-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: freeze_v2p5:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 4, v1
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-GISEL-LABEL: freeze_v2p5:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
-; GFX7-GISEL-NEXT: v_add_i32_e32 v0, vcc, 4, v0
-; GFX7-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX7-GISEL-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, 4, v1
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX7-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: freeze_v2p5:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-GISEL-LABEL: freeze_v2p5:
-; GFX8-GISEL: ; %bb.0:
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
-; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 4, v0
-; GFX8-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX8-GISEL-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, 4, v1
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX8-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX8-LABEL: freeze_v2p5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: freeze_v2p5:
; GFX9: ; %bb.0:
@@ -10905,18 +10877,19 @@ define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX6-SDAG-LABEL: freeze_v3p5:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 4, v0
-; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 4, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 8, v1
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX6-SDAG-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(2)
-; GFX6-SDAG-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -10942,18 +10915,19 @@ define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX7-SDAG-LABEL: freeze_v3p5:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 4, v0
-; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 4, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 8, v1
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX7-SDAG-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(2)
-; GFX7-SDAG-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -11042,22 +11016,24 @@ define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX6-SDAG-LABEL: freeze_v4p5:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 4, v0
-; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 12, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 8, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 4, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 12, v1
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX6-SDAG-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(3)
-; GFX6-SDAG-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT: buffer_store_dword v2, v7, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -11088,22 +11064,24 @@ define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX7-SDAG-LABEL: freeze_v4p5:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 4, v0
-; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 12, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 8, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 4, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 12, v1
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX7-SDAG-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(3)
-; GFX7-SDAG-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT: buffer_store_dword v2, v7, s[0:3], 0 offen
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -11208,21 +11186,21 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX6-SDAG-LABEL: freeze_v8p5:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 24, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 20, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 16, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 12, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 4, v0
-; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 28, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 28, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 20, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 16, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 12, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 8, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 4, v0
; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GFX6-SDAG-NEXT: v_add_i32_e32 v9, vcc, 4, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 8, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v11, vcc, 12, v1
@@ -11230,16 +11208,22 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 20, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 28, v1
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX6-SDAG-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v3, v13, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v2, v14, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7)
-; GFX6-SDAG-NEXT: buffer_store_dword v0, v15, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -11290,21 +11274,21 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX7-SDAG-LABEL: freeze_v8p5:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 24, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 20, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 16, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 12, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 4, v0
-; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 28, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 28, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 20, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 16, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 12, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 8, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 4, v0
; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, 4, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, 8, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, 12, v1
@@ -11312,16 +11296,22 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 20, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 28, v1
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX7-SDAG-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v3, v13, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v2, v14, s[0:3], 0 offen
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7)
-; GFX7-SDAG-NEXT: buffer_store_dword v0, v15, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -11506,79 +11496,77 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX6-SDAG-LABEL: freeze_v16p5:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 16, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 12, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 8, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 4, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 8, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 32, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 28, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 24, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 20, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v9, vcc, 16, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 44, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 40, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 36, v0
+; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen
; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen
; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen
; GFX6-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen
; GFX6-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 56, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 48, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v9, vcc, 44, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 40, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v11, vcc, 36, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 28, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0
-; GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 20, v0
-; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen
; GFX6-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen
; GFX6-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 48, v0
+; GFX6-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 52, v0
+; GFX6-SDAG-NEXT: v_add_i32_e32 v16, vcc, 56, v0
+; GFX6-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_load_dword v16, v16, s[0:3], 0 offen
; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX6-SDAG-NEXT: v_add_i32_e32 v17, vcc, 4, v1
-; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v19, vcc, 12, v1
+; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14)
-; GFX6-SDAG-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14)
-; GFX6-SDAG-NEXT: buffer_store_dword v7, v18, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14)
-; GFX6-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt expcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 16, v1
-; GFX6-SDAG-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 16, v1
+; GFX6-SDAG-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt expcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 40, v1
+; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 32, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v17, vcc, 20, v1
-; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 24, v1
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt expcnt(0)
+; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 24, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 28, v1
-; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v19, vcc, 36, v1
-; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 44, v1
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX6-SDAG-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX6-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX6-SDAG-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX6-SDAG-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX6-SDAG-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 48, v1
-; GFX6-SDAG-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 40, v1
; GFX6-SDAG-NEXT: s_waitcnt expcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 52, v1
-; GFX6-SDAG-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX6-SDAG-NEXT: s_waitcnt expcnt(0)
-; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 56, v1
+; GFX6-SDAG-NEXT: v_add_i32_e32 v9, vcc, 44, v1
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(11)
+; GFX6-SDAG-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v6, v18, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 48, v1
+; GFX6-SDAG-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 52, v1
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 56, v1
; GFX6-SDAG-NEXT: v_add_i32_e32 v1, vcc, 60, v1
-; GFX6-SDAG-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14)
; GFX6-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -11668,75 +11656,73 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
; GFX7-SDAG-LABEL: freeze_v16p5:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 16, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 12, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 8, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 4, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 8, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 32, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 28, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 24, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 20, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, 16, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, 44, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v12, vcc, 40, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 36, v0
+; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen
; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen
; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen
; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen
; GFX7-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 56, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 48, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, 44, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, 40, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, 36, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 28, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0
-; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 20, v0
-; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen
; GFX7-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen
; GFX7-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 48, v0
+; GFX7-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 52, v0
+; GFX7-SDAG-NEXT: v_add_i32_e32 v16, vcc, 56, v0
+; GFX7-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_load_dword v16, v16, s[0:3], 0 offen
; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, 4, v1
-; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v19, vcc, 12, v1
+; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14)
-; GFX7-SDAG-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14)
-; GFX7-SDAG-NEXT: buffer_store_dword v7, v18, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14)
-; GFX7-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 16, v1
-; GFX7-SDAG-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 40, v1
+; GFX7-SDAG-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 16, v1
+; GFX7-SDAG-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 32, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, 20, v1
-; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 24, v1
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 24, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 28, v1
-; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v19, vcc, 36, v1
-; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 44, v1
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX7-SDAG-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX7-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX7-SDAG-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX7-SDAG-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9)
-; GFX7-SDAG-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 48, v1
-; GFX7-SDAG-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 52, v1
-; GFX7-SDAG-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 56, v1
+; GFX7-SDAG-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 40, v1
+; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, 44, v1
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(11)
+; GFX7-SDAG-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v6, v18, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 48, v1
+; GFX7-SDAG-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 52, v1
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 56, v1
; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, 60, v1
-; GFX7-SDAG-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14)
; GFX7-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
index 0658997d087bf..b8a3aa97d2b4e 100644
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -34,8 +34,8 @@ define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v1, v3, v1, v5
; GCN-NEXT: v_or3_b32 v0, v2, v0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 61d09fda44528..5abb7f809b5ef 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -5146,11 +5146,11 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -5162,11 +5162,11 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
@@ -5180,10 +5180,10 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] offset:4
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] offset:4
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 92ea83fdfb982..9cb12803dde6a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -3426,20 +3426,20 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
-; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; CI-NEXT: v_lshlrev_b32_e32 v11, 5, v0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v11
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3]
-; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
+; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
+; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4
+; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CI-NEXT: flat_load_dwordx4 v[7:10], v[4:5]
; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
+; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v11
; CI-NEXT: v_cvt_f32_f16_e32 v6, s4
; CI-NEXT: s_cmp_eq_u32 s5, 15
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
@@ -3450,106 +3450,105 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 12
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9
-; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v8
; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1]
; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 11
-; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
-; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3]
+; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc
+; CI-NEXT: v_cndmask_b32_e64 v13, v13, v6, s[2:3]
; CI-NEXT: s_cselect_b64 vcc, -1, 0
-; CI-NEXT: s_cmp_eq_u32 s5, 10
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
-; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: s_cselect_b64 vcc, -1, 0
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: s_cmp_eq_u32 s5, 10
+; CI-NEXT: s_cselect_b64 vcc, -1, 0
+; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7
; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
+; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
-; CI-NEXT: v_or_b32_e32 v9, v9, v12
-; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; CI-NEXT: v_or_b32_e32 v8, v8, v12
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v14
+; CI-NEXT: v_or_b32_e32 v9, v9, v13
+; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v15
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v15
; CI-NEXT: s_cmp_eq_u32 s5, 9
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 8
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v16
-; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc
+; CI-NEXT: v_or_b32_e32 v8, v8, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v16
+; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 7
-; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 6
-; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc
+; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 5
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 4
; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT: v_or_b32_e32 v10, v10, v11
-; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; CI-NEXT: v_or_b32_e32 v7, v7, v12
-; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_or_b32_e32 v3, v3, v12
-; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; CI-NEXT: v_or_b32_e32 v10, v10, v12
+; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_or_b32_e32 v3, v3, v11
+; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_or_b32_e32 v2, v2, v12
-; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; CI-NEXT: v_or_b32_e32 v2, v2, v11
+; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; CI-NEXT: s_cmp_eq_u32 s5, 3
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 2
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
+; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 1
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 0
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
-; CI-NEXT: v_or_b32_e32 v1, v1, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12
+; CI-NEXT: v_or_b32_e32 v1, v1, v6
+; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
; CI-NEXT: v_or_b32_e32 v0, v0, v6
+; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; CI-NEXT: s_nop 0
+; CI-NEXT: v_or_b32_e32 v7, v7, v14
; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index d4867dbaa14b3..2dd009dee7339 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -300,8 +300,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
@@ -309,6 +307,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index 1caa1442fd2fd..110e9101a924c 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -12,9 +12,9 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG: ; %bb.0: ; %bb
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: s_clause 0x1
-; SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
-; SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: global_load_b128 v[2:5], v[0:1], off
+; SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
; SDAG-NEXT: s_mov_b32 s12, 0
; SDAG-NEXT: s_mov_b32 s3, exec_lo
; SDAG-NEXT: s_mov_b32 s13, s12
@@ -22,31 +22,31 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG-NEXT: s_mov_b32 s15, s12
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; SDAG-NEXT: v_readfirstlane_b32 s8, v4
-; SDAG-NEXT: v_readfirstlane_b32 s9, v5
-; SDAG-NEXT: v_readfirstlane_b32 s10, v6
-; SDAG-NEXT: v_readfirstlane_b32 s11, v7
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s1, s[8:9], v[4:5]
+; SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s1, s[8:9], v[6:7]
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; SDAG-NEXT: v_cmp_eq_u64_e64 s2, s[10:11], v[6:7]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s2, s[10:11], v[8:9]
; SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
; SDAG-NEXT: s_and_b32 s0, s0, s1
; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: s_and_b32 s0, s0, s2
; SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; SDAG-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; SDAG-NEXT: image_sample_c_lz v0, [v1, v1, v1, v1], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; SDAG-NEXT: s_cbranch_execnz .LBB0_1
; SDAG-NEXT: ; %bb.2:
; SDAG-NEXT: s_mov_b32 exec_lo, s3
-; SDAG-NEXT: v_dual_mov_b32 v0, 0x7fc00000 :: v_dual_mov_b32 v1, 1.0
+; SDAG-NEXT: v_dual_mov_b32 v2, 0x7fc00000 :: v_dual_mov_b32 v3, 1.0
; SDAG-NEXT: s_mov_b32 s0, s12
; SDAG-NEXT: s_mov_b32 s1, s12
; SDAG-NEXT: s_mov_b32 s2, s12
@@ -56,18 +56,20 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG-NEXT: s_mov_b32 s6, s12
; SDAG-NEXT: s_mov_b32 s7, s12
; SDAG-NEXT: s_clause 0x2
-; SDAG-NEXT: image_sample_c_lz v0, [v8, v8, v0, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT: image_sample_c_lz v2, [v8, v8, v8, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT: image_sample_c_lz v1, [v8, v1, v8, v8], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: image_sample_c_lz v2, [v1, v1, v2, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: image_sample_c_lz v3, [v1, v3, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: image_sample_c_lz v4, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; SDAG-NEXT: s_waitcnt vmcnt(2)
-; SDAG-NEXT: v_dual_add_f32 v0, v9, v0 :: v_dual_mov_b32 v9, v8
+; SDAG-NEXT: v_add_f32_e32 v0, v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v2, v1
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-NEXT: v_add_f32_e32 v0, v1, v0
-; SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; SDAG-NEXT: v_add_f32_e32 v0, v4, v0
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-NEXT: v_dual_mul_f32 v7, 0x3e800000, v0 :: v_dual_mov_b32 v0, 0
-; SDAG-NEXT: image_store v[7:9], [v0, v0], s[0:7] dim:SQ_RSRC_IMG_2D unorm
+; SDAG-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0
+; SDAG-NEXT: image_store v[0:2], [v3, v3], s[0:7] dim:SQ_RSRC_IMG_2D unorm
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: issue92561:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index 3d79bdc25336d..9c1f588604d8c 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -341,15 +341,16 @@ define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) {
; SPLIT-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8
; SPLIT-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; SPLIT-NEXT: s_clause 0x3
-; SPLIT-NEXT: flat_load_dword v8, v[2:3]
+; SPLIT-NEXT: flat_load_dword v8, v[0:1]
; SPLIT-NEXT: flat_load_dword v9, v[4:5]
-; SPLIT-NEXT: flat_load_dword v10, v[0:1]
+; SPLIT-NEXT: flat_load_dword v10, v[2:3]
; SPLIT-NEXT: flat_load_dword v11, v[6:7]
-; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; SPLIT-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; SPLIT-NEXT: flat_store_dword v[2:3], v8
+; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
; SPLIT-NEXT: flat_store_dword v[6:7], v9
-; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
-; SPLIT-NEXT: flat_store_dword v[2:3], v10
-; SPLIT-NEXT: flat_store_dword v[0:1], v8
+; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
+; SPLIT-NEXT: flat_store_dword v[0:1], v10
; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; SPLIT-NEXT: flat_store_dword v[4:5], v11
; SPLIT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index e7d7f87e4fc4c..5aca6131ab2f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -5836,14 +5836,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; NOLIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; NOLIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; NOLIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; NOLIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; NOLIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; NOLIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; NOLIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(0)
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
@@ -5931,14 +5931,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; LIT-SRCC-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; LIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; LIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; LIT-SRCC-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; LIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; LIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; LIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; LIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; LIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; LIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; LIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; LIT-SRCC-NEXT: s_waitcnt vmcnt(0)
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
@@ -6029,14 +6029,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
; GFX90A-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX90A-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX90A-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
; GFX90A-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX90A-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX90A-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
; GFX90A-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX90A-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX90A-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX90A-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
@@ -6059,14 +6059,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
; GFX942-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
; GFX942-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX942-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
; GFX942-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
@@ -6089,14 +6089,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX942-VGPR-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX942-VGPR-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX942-VGPR-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX942-VGPR-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX942-VGPR-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX942-VGPR-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0)
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 15
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 25996ee11c5a1..d45aaf41a4f27 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -175,10 +175,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
@@ -630,10 +630,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
@@ -959,10 +959,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1956,10 +1956,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2305,10 +2305,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2654,10 +2654,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3003,10 +3003,10 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 2752649550b69..2c744241d42cd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -2530,9 +2530,9 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX802-SDAG-NEXT: v_add_u32_e32 v17, vcc, 16, v0
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
; GFX802-SDAG-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dwordx3 v[14:16], v[17:18]
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4
@@ -2543,12 +2543,12 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: v_writelane_b32 v13, s8, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v12, s9, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v11, s10, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v10, s11, m0
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v15, s6, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v14, s7, m0
; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
@@ -2931,9 +2931,9 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX802-SDAG-NEXT: v_add_u32_e32 v13, vcc, 16, v0
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1]
; GFX802-SDAG-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dwordx2 v[15:16], v[13:14]
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1]
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v8
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4
@@ -2943,12 +2943,12 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v6
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: v_writelane_b32 v12, s7, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v11, s8, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v10, s9, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v9, s10, m0
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v15, s6, m0
; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
; GFX802-SDAG-NEXT: flat_store_dwordx2 v[13:14], v[15:16]
@@ -3096,9 +3096,9 @@ define void @test_writelane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1]
; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20]
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[19:20]
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[0:1]
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v10
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v5
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v4
@@ -3110,17 +3110,17 @@ define void @test_writelane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v7
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v6
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX802-SDAG-NEXT: v_writelane_b32 v14, s9, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v13, s10, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v12, s11, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v11, s12, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v14, s5, m0
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT: v_writelane_b32 v18, s5, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v17, s6, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v16, s7, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v15, s8, m0
-; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
-; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; GFX802-SDAG-NEXT: v_writelane_b32 v18, s9, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v17, s10, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s11, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v15, s12, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v13, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v12, s7, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v11, s8, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[11:14]
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -3285,21 +3285,21 @@ define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v18
; GFX802-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[0:1]
-; GFX802-SDAG-NEXT: v_add_u32_e32 v22, vcc, 16, v0
+; GFX802-SDAG-NEXT: v_add_u32_e32 v22, vcc, 32, v0
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v5
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v4
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v3
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v2
; GFX802-SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[2:5], v[22:23]
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v15
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v14
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s12, v13
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s13, v12
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[22:23]
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s14, v11
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s15, v10
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v16
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
; GFX802-SDAG-NEXT: v_writelane_b32 v21, s5, m0
@@ -3307,34 +3307,34 @@ define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32
; GFX802-SDAG-NEXT: v_writelane_b32 v19, s7, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v18, s8, m0
; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21]
-; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8
-; GFX802-SDAG-NEXT: v_add_u32_e32 v18, vcc, 32, v0
-; GFX802-SDAG-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc
-; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 48, v0
-; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7
+; GFX802-SDAG-NEXT: v_add_u32_e32 v18, vcc, 48, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: v_add_u32_e32 v20, vcc, 16, v0
+; GFX802-SDAG-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[20:21]
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[18:19]
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[18:19]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v17
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(3)
-; GFX802-SDAG-NEXT: v_writelane_b32 v5, s4, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v3, s6, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v2, s7, m0
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX802-SDAG-NEXT: v_writelane_b32 v9, s8, m0
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: v_writelane_b32 v15, s12, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v14, s13, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v13, s14, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v12, s15, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v8, s9, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v7, s10, m0
-; GFX802-SDAG-NEXT: v_writelane_b32 v6, s11, m0
-; GFX802-SDAG-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
-; GFX802-SDAG-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v3, s4, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v2, s5, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v1, s6, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v0, s7, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: v_writelane_b32 v7, s8, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v6, s9, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v5, s10, m0
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s11, m0
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
+; GFX802-SDAG-NEXT: flat_store_dwordx4 v[20:21], v[0:3]
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
index 796884a612816..1480080259a33 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
@@ -56,9 +56,9 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX802-SDAG-NEXT: v_add_u32_e32 v13, vcc, 16, v0
-; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1]
; GFX802-SDAG-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dwordx2 v[15:16], v[13:14]
+; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1]
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v8
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4
@@ -68,12 +68,12 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v6
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1)
+; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: v_writelane_b32 v12, s7, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v11, s8, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v10, s9, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v9, s10, m0
-; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v15, s6, m0
; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
; GFX802-SDAG-NEXT: flat_store_dwordx2 v[13:14], v[15:16]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 0194e3c6ce37b..7bae1a14704a3 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -766,28 +766,28 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_clause 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:4
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:8
; GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:12
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:28
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:24
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:20
; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1] offset:16
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:8
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:4
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:20
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:24
+; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:28
; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -802,28 +802,28 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, 0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_clause 0x7
+; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:4
+; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:8
; GFX12-FAKE16-NEXT: global_load_u16 v3, v8, s[0:1] offset:12
-; GFX12-FAKE16-NEXT: global_load_u16 v7, v8, s[0:1] offset:28
-; GFX12-FAKE16-NEXT: global_load_u16 v6, v8, s[0:1] offset:24
-; GFX12-FAKE16-NEXT: global_load_u16 v5, v8, s[0:1] offset:20
; GFX12-FAKE16-NEXT: global_load_u16 v4, v8, s[0:1] offset:16
-; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:8
-; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:4
+; GFX12-FAKE16-NEXT: global_load_u16 v5, v8, s[0:1] offset:20
+; GFX12-FAKE16-NEXT: global_load_u16 v6, v8, s[0:1] offset:24
+; GFX12-FAKE16-NEXT: global_load_u16 v7, v8, s[0:1] offset:28
; GFX12-FAKE16-NEXT: global_load_u16 v0, v8, s[0:1]
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
+; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 388006281abdc..634a5adab58e2 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3652,194 +3652,193 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s6, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCN-HSA-NEXT: s_add_u32 s8, s2, 48
; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9
; GCN-HSA-NEXT: s_add_u32 s10, s2, 64
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8
; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0
-; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
+; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x50
; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x60
-; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[4:5]
; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GCN-HSA-NEXT: s_add_u32 s2, s2, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s2, 0x70
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13]
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6
-; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13]
-; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[14:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
+; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[16:17]
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8
+; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[16:17]
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6
+; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[18:19]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0
-; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v25
-; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v24
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35]
-; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0
+; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0
+; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xc0
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xb0
+; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x80
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v27
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v27
-; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v26
+; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xa0
; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[32:35]
+; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xb0
+; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
+; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x80
+; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s12
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v1
+; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[32:35]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v8
-; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v9
-; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v8
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v13
+; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v12
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[32:35]
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v15
+; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v14
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[32:35]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v33, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, s8
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9
+; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v11
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v11
-; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v10
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v6
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v28
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v29
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v28
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v29
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v31
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v30
-; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v31
-; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v30
-; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[7:10]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v7
+; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7
+; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v29
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v28
+; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v29
+; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v28
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v34, s6
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v30
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v31
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v30
+; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v31
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[7:10]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[11:14]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v15
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v16
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12
-; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v15
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13
-; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v17
+; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v16
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v25
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v25
+; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v24
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v27
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v26
+; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v27
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v26
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v14
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v19
+; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v18
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v21
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v20
+; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v21
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v20
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v23
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v23
+; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v22
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -4494,11 +4493,11 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[8:9]
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6
; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
@@ -4595,49 +4594,48 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8
-; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v0, v16, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24
-; GCN-HSA-NEXT: v_bfe_i32 v12, v24, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v11
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v10
-; GCN-HSA-NEXT: v_bfe_i32 v23, v11, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v21, v10, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10
+; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v0, v10, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v8
+; GCN-HSA-NEXT: v_bfe_i32 v22, v9, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v20, v8, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v19
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v18
+; GCN-HSA-NEXT: v_bfe_i32 v10, v19, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v18, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[21:24]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18
-; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v17
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v16
-; GCN-HSA-NEXT: v_bfe_i32 v19, v17, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v17, v16, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[17:20]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[20:23]
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24
; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v12, v24, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
@@ -4666,107 +4664,106 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:112
; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7)
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14)
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v10, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v42, 16, v9
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v40, 16, v8
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v41, v9, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v8, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v23
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v22
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v23, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v22, 0, 16
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v4, off, s[88:91], 0 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v21
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v20
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v21, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v20, 0, 16
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12)
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v50, 16, v1
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v48, 16, v0
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v49, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v0, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v3
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v2
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v1
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v0
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v0, 0, 16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8)
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v36
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v35
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v35, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v46, 16, v5
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v44, 16, v4
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v45, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v54, 16, v26
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v52, 16, v25
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v53, v26, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v25, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v58, 16, v24
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v56, 16, v23
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v57, v24, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v23, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v30
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v29
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v30, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v29, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v62, 16, v28
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v60, 16, v27
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v61, v28, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v27, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v34
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v33
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v34, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v33, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v32
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v31
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v32, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v31, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v38
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v37
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v38, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v37, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v37
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v36
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v37, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v36, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 16, v19
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 16, v18
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v19, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v17
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v17, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v16, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v11
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v10
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v11, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v10, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v9
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v8
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v9, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v8, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v27
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v26
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v27, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v26, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v25
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v24
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v25, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v24, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v31
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v30
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v31, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v30, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v63, 16, v29
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 16, v28
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v62, v29, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v28, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v35
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v34
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v35, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v34, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v33
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v32
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v33, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v32, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v39
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v38
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v39, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v38, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:160
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 7203545ebf9a8..e0164e6ccaf9b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -2377,8 +2377,8 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: s_mov_b32 s9, s7
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
@@ -2397,24 +2397,23 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v5
; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9
-; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10
-; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
-; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v12
-; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v13
-; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v14
-; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v15
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v15
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v14
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v13
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v12
+; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v12
+; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v13
+; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v14
+; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v15
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v10
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v9
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v9
+; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v10
+; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v11
; SI-NOHSA-NEXT: s_mov_b32 s0, s4
; SI-NOHSA-NEXT: s_mov_b32 s1, s5
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
@@ -2424,7 +2423,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
@@ -2686,51 +2685,51 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
-; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3]
-; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, v6
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v2
+; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v6
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, v5
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, v7
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, v2
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, v3
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v2
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v3
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, v9
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
+; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v32, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v34, v13
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v14
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, v15
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v10
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v34, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, v11
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:96
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:112
; GCN-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:80
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:32
; GCN-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
; GCN-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1]
-; GCN-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:16
; GCN-HSA-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(1) %in
%ext = sext <16 x i32> %ld to <16 x i64>
@@ -3075,17 +3074,17 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s6
; SI-NOHSA-NEXT: s_mov_b32 s9, s7
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v31
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v30
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v27
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v26
; SI-NOHSA-NEXT: s_waitcnt vmcnt(6)
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14
@@ -3095,17 +3094,17 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v42, v13
; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v14
; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v15
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28
-; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v28
-; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29
-; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30
-; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v25
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v24
+; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v24
+; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v25
+; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v26
+; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v27
; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(9)
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(8)
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
@@ -3115,7 +3114,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v5
; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v6
; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v7
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(8)
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1
@@ -3124,42 +3122,40 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v50, v1
; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16
-; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v16
-; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v17
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19
; SI-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20
-; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v20
-; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v21
-; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v22
-; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v23
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v27
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v26
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v25
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v24
-; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v24
-; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v25
-; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v26
-; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v27
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v23
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v22
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v21
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v20
+; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v20
+; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v21
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v22
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v23
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v19
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v18
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v17
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v16
+; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v16
+; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v17
+; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v18
+; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v19
; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9
-; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9
-; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v10
-; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v11
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v31
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v30
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v29
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v28
+; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v28
+; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v29
+; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v30
+; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v31
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v11
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v10
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v9
+; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v9
+; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v10
+; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v11
; SI-NOHSA-NEXT: s_mov_b32 s0, s4
; SI-NOHSA-NEXT: s_mov_b32 s1, s5
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
@@ -3178,11 +3174,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
@@ -3210,40 +3206,40 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28
; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v28
; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0
@@ -3296,24 +3292,24 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v19
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v18
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v17
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v16
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v16
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v17
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v18
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v19
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v5
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
@@ -3325,55 +3321,54 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v7
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26]
-; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2
+; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10)
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v13
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v12
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v12
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v13
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v18
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v15
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v14
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v14
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v15
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26]
-; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[23:26]
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v8
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v9
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0
-; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v1
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2
+; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11
; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
@@ -3391,10 +3386,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:80
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
@@ -3402,52 +3397,52 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v11
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v10
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v3
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v2
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v12
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v13
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v14
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v15
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v8
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v9
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v10
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v11
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v7
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v6
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v4
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v5
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v11
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v10
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v9
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v8
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v8
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v9
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v10
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v11
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v6
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v7
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v2
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v3
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v15
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v14
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v14
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v15
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v19
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v19
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v19
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v0
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v1
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v13
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v12
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v12
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v13
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v18
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v52, v16
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v54, v17
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v18
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v18
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20
@@ -3458,36 +3453,36 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v26
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v24
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:224
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:240
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v46, 31, v25
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v44, 31, v24
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v31
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v30
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v31
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v29
-; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v28
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v28
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v29
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v31
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v30
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v31
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v29
+; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v28
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v28
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v29
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:208
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v30
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v24
-; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v25
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v30
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v43, v24
+; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v45, v25
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v26
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v27
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
;
@@ -3656,233 +3651,109 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; EG-NEXT: MOV * T32.Z, T12.Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
-; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
-; GCN-GFX900-HSA: ; %bb.0:
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1]
-; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v12, 0
-; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17
-; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0
-; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:80
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v6
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v7
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v37, v8
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v39, v9
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v10
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v11
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v5
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v29, off, s[20:23], 0 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v30, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v31, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v32, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v16
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v15
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v45, v13
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v47, v14
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v15
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v16
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v49, v17
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v51, v18
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v19
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v20
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v53, v21
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v55, v22
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v17, v23
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v19, v24
-; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3]
-; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v36, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v41, v25
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v0
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v23
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v24
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v22
-; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
-; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:144
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[25:28], s[0:1] offset:32
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:48
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1]
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
-; GCN-GFX900-HSA-NEXT: s_endpgm
-;
-; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64:
-; GCN-GFX908-HSA: ; %bb.0:
-; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v12, 0
-; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:80
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v37, v8
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v39, v9
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v10
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v11
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v10, v5
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v6
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v7
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v16
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v15
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v45, v13
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v47, v14
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v15
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v16
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v49, v17
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v51, v18
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v13, v19
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v15, v20
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v53, v21
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v55, v22
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v23
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v24
-; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3]
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v29
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v32
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v30
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v31
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v36, a3
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v41, v25
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v43, v26
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v27
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v0
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v0
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v1
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v57, v2
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v59, v3
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a2
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a1
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a0
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
-; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, v23
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v2, v24
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v22
-; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, v21
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:144
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[25:28], s[0:1] offset:32
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:48
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1]
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16
-; GCN-GFX908-HSA-NEXT: s_endpgm
+; GCN-HSA-LABEL: global_sextload_v32i32_to_v32i64:
+; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0
+; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96
+; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112
+; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:80
+; GCN-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64
+; GCN-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48
+; GCN-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32
+; GCN-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:16
+; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
+; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v37, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v39, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v33, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v35, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, v7
+; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v41, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v43, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, v2
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v3
+; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v15
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13
+; GCN-HSA-NEXT: v_mov_b32_e32 v45, v13
+; GCN-HSA-NEXT: v_mov_b32_e32 v47, v14
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, v15
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, v16
+; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17
+; GCN-HSA-NEXT: v_mov_b32_e32 v49, v17
+; GCN-HSA-NEXT: v_mov_b32_e32 v51, v18
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, v19
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, v20
+; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21
+; GCN-HSA-NEXT: v_mov_b32_e32 v53, v21
+; GCN-HSA-NEXT: v_mov_b32_e32 v55, v22
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, v23
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, v24
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v28
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v26
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v25
+; GCN-HSA-NEXT: v_mov_b32_e32 v57, v25
+; GCN-HSA-NEXT: v_mov_b32_e32 v59, v26
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, v27
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, v28
+; GCN-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3]
+; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
+; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v28
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v27
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v26
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v25
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v25
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v26
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:208
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:176
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:144
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:32
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[21:24], s[0:1] offset:48
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v32, v27
+; GCN-HSA-NEXT: v_mov_b32_e32 v34, v28
+; GCN-HSA-NEXT: global_store_dwordx4 v12, v[32:35], s[0:1] offset:16
+; GCN-HSA-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(1) %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -3902,31 +3773,31 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s6
; SI-NOHSA-NEXT: s_mov_b32 s9, s7
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
; SI-NOHSA-NEXT: s_mov_b32 s0, s4
; SI-NOHSA-NEXT: s_mov_b32 s1, s5
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0)
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9
+; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10
-; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11
+; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6
+; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0)
; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v32
@@ -4485,22 +4356,22 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
-; SI-NOHSA-NEXT: s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:112
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:96
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:112
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:64
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:80
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
@@ -4605,27 +4476,29 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
-; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
-; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, s7
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:80
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:96
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:112
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:64
+; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:80
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
@@ -4724,3 +4597,6 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
}
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-GFX900-HSA: {{.*}}
+; GCN-GFX908-HSA: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index 04d906ca6ad9c..37f73ecf5b360 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -4085,73 +4085,73 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; SI-NEXT: s_addc_u32 s13, s13, 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v20, s1
+; SI-NEXT: v_mov_b32_e32 v24, s1
; SI-NEXT: s_mov_b32 m0, -1
-; SI-NEXT: ds_read2_b64 v[4:7], v20 offset0:8 offset1:9
-; SI-NEXT: ds_read2_b64 v[0:3], v20 offset0:10 offset1:11
-; SI-NEXT: ds_read2_b64 v[8:11], v20 offset0:12 offset1:13
-; SI-NEXT: ds_read2_b64 v[12:15], v20 offset0:14 offset1:15
-; SI-NEXT: ds_read2_b64 v[16:19], v20 offset1:1
-; SI-NEXT: ds_read2_b64 v[30:33], v20 offset0:2 offset1:3
-; SI-NEXT: ds_read2_b64 v[34:37], v20 offset0:4 offset1:5
-; SI-NEXT: ds_read2_b64 v[38:41], v20 offset0:6 offset1:7
+; SI-NEXT: ds_read2_b64 v[4:7], v24 offset0:8 offset1:9
+; SI-NEXT: ds_read2_b64 v[0:3], v24 offset0:10 offset1:11
+; SI-NEXT: ds_read2_b64 v[12:15], v24 offset0:12 offset1:13
+; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:14 offset1:15
+; SI-NEXT: ds_read2_b64 v[20:23], v24 offset1:1
+; SI-NEXT: ds_read2_b64 v[16:19], v24 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[34:37], v24 offset0:4 offset1:5
+; SI-NEXT: ds_read2_b64 v[38:41], v24 offset0:6 offset1:7
; SI-NEXT: s_waitcnt lgkmcnt(7)
-; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v5
-; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v4
-; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7
-; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v6
+; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v5
+; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v4
+; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v7
+; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v6
; SI-NEXT: s_waitcnt lgkmcnt(6)
-; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v1
-; SI-NEXT: v_bfe_i32 v20, v5, 0, 16
-; SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: v_bfe_i32 v22, v4, 0, 16
-; SI-NEXT: v_bfe_i32 v24, v7, 0, 16
-; SI-NEXT: v_bfe_i32 v26, v6, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v1
+; SI-NEXT: v_bfe_i32 v24, v5, 0, 16
+; SI-NEXT: buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_i32 v26, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v28, v7, 0, 16
+; SI-NEXT: v_bfe_i32 v30, v6, 0, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v0
+; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v0
; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v3
-; SI-NEXT: v_bfe_i32 v28, v1, 0, 16
-; SI-NEXT: v_bfe_i32 v20, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v32, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v24, v0, 0, 16
; SI-NEXT: v_bfe_i32 v6, v3, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v2
; SI-NEXT: v_bfe_i32 v4, v2, 0, 16
; SI-NEXT: s_waitcnt lgkmcnt(5)
-; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v9
-; SI-NEXT: v_bfe_i32 v2, v9, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8
-; SI-NEXT: v_bfe_i32 v8, v8, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v11
-; SI-NEXT: v_bfe_i32 v42, v11, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10
-; SI-NEXT: v_bfe_i32 v10, v10, 0, 16
-; SI-NEXT: s_waitcnt lgkmcnt(4)
-; SI-NEXT: v_ashrrev_i32_e32 v45, 16, v13
-; SI-NEXT: v_bfe_i32 v44, v13, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v13
+; SI-NEXT: v_bfe_i32 v2, v13, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12
; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v47, 16, v15
-; SI-NEXT: v_bfe_i32 v46, v15, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v15
+; SI-NEXT: v_bfe_i32 v42, v15, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14
; SI-NEXT: v_bfe_i32 v14, v14, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(4)
+; SI-NEXT: v_ashrrev_i32_e32 v45, 16, v9
+; SI-NEXT: v_bfe_i32 v44, v9, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8
+; SI-NEXT: v_bfe_i32 v8, v8, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v47, 16, v11
+; SI-NEXT: v_bfe_i32 v46, v11, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10
+; SI-NEXT: v_bfe_i32 v10, v10, 0, 16
; SI-NEXT: s_waitcnt lgkmcnt(3)
-; SI-NEXT: v_ashrrev_i32_e32 v49, 16, v17
-; SI-NEXT: v_bfe_i32 v48, v17, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v49, 16, v21
+; SI-NEXT: v_bfe_i32 v48, v21, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v20
+; SI-NEXT: v_bfe_i32 v20, v20, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v51, 16, v23
+; SI-NEXT: v_bfe_i32 v50, v23, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v22
+; SI-NEXT: v_bfe_i32 v22, v22, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(2)
+; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v17
+; SI-NEXT: v_bfe_i32 v52, v17, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v16
; SI-NEXT: v_bfe_i32 v16, v16, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v51, 16, v19
-; SI-NEXT: v_bfe_i32 v50, v19, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v19
+; SI-NEXT: v_bfe_i32 v54, v19, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v18
; SI-NEXT: v_bfe_i32 v18, v18, 0, 16
-; SI-NEXT: s_waitcnt lgkmcnt(2)
-; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v31
-; SI-NEXT: v_bfe_i32 v52, v31, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v30
-; SI-NEXT: v_bfe_i32 v30, v30, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33
-; SI-NEXT: v_bfe_i32 v54, v33, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v32
-; SI-NEXT: v_bfe_i32 v32, v32, 0, 16
; SI-NEXT: s_waitcnt lgkmcnt(1)
; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v35
; SI-NEXT: v_bfe_i32 v56, v35, 0, 16
@@ -4175,21 +4175,21 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13
; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11
; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9
-; SI-NEXT: ds_write2_b64 v0, v[32:33], v[54:55] offset0:6 offset1:7
-; SI-NEXT: ds_write2_b64 v0, v[30:31], v[52:53] offset0:4 offset1:5
-; SI-NEXT: ds_write2_b64 v0, v[18:19], v[50:51] offset0:2 offset1:3
-; SI-NEXT: ds_write2_b64 v0, v[16:17], v[48:49] offset1:1
-; SI-NEXT: ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 offset1:31
-; SI-NEXT: ds_write2_b64 v0, v[12:13], v[44:45] offset0:28 offset1:29
-; SI-NEXT: ds_write2_b64 v0, v[10:11], v[42:43] offset0:26 offset1:27
-; SI-NEXT: ds_write2_b64 v0, v[8:9], v[2:3] offset0:24 offset1:25
+; SI-NEXT: ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v0, v[20:21], v[48:49] offset1:1
+; SI-NEXT: ds_write2_b64 v0, v[10:11], v[46:47] offset0:30 offset1:31
+; SI-NEXT: ds_write2_b64 v0, v[8:9], v[44:45] offset0:28 offset1:29
+; SI-NEXT: ds_write2_b64 v0, v[14:15], v[42:43] offset0:26 offset1:27
+; SI-NEXT: ds_write2_b64 v0, v[12:13], v[2:3] offset0:24 offset1:25
; SI-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset0:22 offset1:23
-; SI-NEXT: ds_write2_b64 v0, v[20:21], v[28:29] offset0:20 offset1:21
-; SI-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:18 offset1:19
+; SI-NEXT: ds_write2_b64 v0, v[24:25], v[32:33] offset0:20 offset1:21
+; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19
; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: ds_write2_b64 v0, v[22:23], v[1:2] offset0:16 offset1:17
+; SI-NEXT: ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17
; SI-NEXT: s_endpgm
;
; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
@@ -4200,59 +4200,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
-; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
-; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
-; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
-; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
-; VI-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32
-; VI-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
-; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
-; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
-; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
-; VI-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36
-; VI-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35
-; VI-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30
-; VI-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29
-; VI-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16
-; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
-; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
-; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
-; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
-; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
-; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
@@ -4263,45 +4224,87 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21
-; VI-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20
-; VI-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23
-; VI-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v19
+; VI-NO-DS128-NEXT: v_bfe_i32 v26, v19, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v18
+; VI-NO-DS128-NEXT: v_bfe_i32 v28, v18, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v21
+; VI-NO-DS128-NEXT: v_bfe_i32 v30, v21, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v20
+; VI-NO-DS128-NEXT: v_bfe_i32 v32, v20, 0, 16
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v35, 16, v23
+; VI-NO-DS128-NEXT: v_bfe_i32 v34, v23, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v37, 16, v22
+; VI-NO-DS128-NEXT: v_bfe_i32 v36, v22, 0, 16
+; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
+; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v17
+; VI-NO-DS128-NEXT: v_bfe_i32 v40, v17, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v16
+; VI-NO-DS128-NEXT: v_bfe_i32 v42, v16, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v19
+; VI-NO-DS128-NEXT: v_bfe_i32 v44, v19, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v18
+; VI-NO-DS128-NEXT: v_bfe_i32 v46, v18, 0, 16
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v21
+; VI-NO-DS128-NEXT: v_bfe_i32 v48, v21, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v20
+; VI-NO-DS128-NEXT: v_bfe_i32 v50, v20, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v23
+; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13
+; VI-NO-DS128-NEXT: v_bfe_i32 v52, v23, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v22
+; VI-NO-DS128-NEXT: v_bfe_i32 v54, v22, 0, 16
+; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v39, 16, v25
+; VI-NO-DS128-NEXT: v_bfe_i32 v38, v25, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v24
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v63, 16, v23
+; VI-NO-DS128-NEXT: v_bfe_i32 v62, v23, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22
; VI-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34
-; VI-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33
-; VI-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36
-; VI-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35
-; VI-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29
-; VI-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28
-; VI-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3
-; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
-; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; VI-NO-DS128-NEXT: v_bfe_i32 v24, v24, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v57, 16, v17
+; VI-NO-DS128-NEXT: v_bfe_i32 v56, v17, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v16
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v59, 16, v19
+; VI-NO-DS128-NEXT: v_bfe_i32 v58, v19, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v18
+; VI-NO-DS128-NEXT: v_bfe_i32 v18, v18, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v61, 16, v21
+; VI-NO-DS128-NEXT: v_bfe_i32 v60, v21, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v20
+; VI-NO-DS128-NEXT: v_bfe_i32 v20, v20, 0, 16
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[24:25], v[38:39] offset0:14 offset1:15
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[36:37], v[34:35] offset0:12 offset1:13
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[32:33], v[30:31] offset0:10 offset1:11
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[28:29], v[26:27] offset0:8 offset1:9
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3
+; VI-NO-DS128-NEXT: buffer_load_dword v4, off, s[88:91], 0 ; 4-byte Folded Reload
+; VI-NO-DS128-NEXT: buffer_load_dword v5, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0)
-; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
; VI-NO-DS128-NEXT: s_endpgm
;
; GFX9-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
@@ -4312,59 +4315,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
-; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
@@ -4375,45 +4339,87 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v19
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v19, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v18
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v18, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v21
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v30, v21, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v20
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v32, v20, 0, 16
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v35, 16, v23
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v34, v23, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v37, 16, v22
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v36, v22, 0, 16
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v17
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v40, v17, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v42, v16, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v19
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v44, v19, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v18
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v46, v18, 0, 16
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v21
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v48, v21, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v20
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v50, v20, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v23
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v52, v23, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v22
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v54, v22, 0, 16
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v39, 16, v25
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v38, v25, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v24
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v63, 16, v23
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v62, v23, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22
; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
-; GFX9-NO-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v57, 16, v17
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v56, v17, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v59, 16, v19
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v58, v19, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v18
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v18, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v61, 16, v21
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v60, v21, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v20
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[24:25], v[38:39] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[36:37], v[34:35] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[32:33], v[30:31] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[28:29], v[26:27] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX9-NO-DS128-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
; GFX9-NO-DS128-NEXT: s_endpgm
;
; EG-LABEL: local_sextload_v64i16_to_v64i32:
@@ -4845,129 +4851,139 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32:
; VI-DS128: ; %bb.0:
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-DS128-NEXT: s_mov_b32 m0, -1
+; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-DS128-NEXT: s_mov_b32 s90, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
-; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
-; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
+; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT: ds_read_b128 v[8:11], v0
+; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
-; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16
-; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
-; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
+; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v10
+; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
+; VI-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v1, v10, 0, 16
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v9
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v8
+; VI-DS128-NEXT: v_bfe_i32 v6, v9, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v4, v8, 0, 16
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v19
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v18
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16
-; VI-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v3, v19, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v1, v18, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26
-; VI-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v23
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v22
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v21
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v20
+; VI-DS128-NEXT: v_bfe_i32 v18, v23, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v16, v22, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v30, v21, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v28, v20, 0, 16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36
-; VI-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16
-; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
-; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
-; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v27
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v26
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v35, 16, v25
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v24
+; VI-DS128-NEXT: v_bfe_i32 v22, v27, 0, 16
+; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
+; VI-DS128-NEXT: v_bfe_i32 v20, v26, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v34, v25, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v32, v24, 0, 16
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
+; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37
-; VI-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v39
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v38
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
-; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
-; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
-; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
-; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v27
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v26
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v25
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v24
+; VI-DS128-NEXT: v_bfe_i32 v49, v27, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v47, v26, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v53, v25, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v51, v24, 0, 16
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v37
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v36
+; VI-DS128-NEXT: v_bfe_i32 v41, v39, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v39, v38, 0, 16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
-; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
-; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36
-; VI-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41
-; VI-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56
-; VI-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39
-; VI-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16
-; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224
-; VI-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240
-; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192
-; VI-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208
-; VI-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160
-; VI-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176
-; VI-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128
-; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144
-; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96
-; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112
-; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64
-; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80
-; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32
-; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v25
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v24
+; VI-DS128-NEXT: v_bfe_i32 v2, v25, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v0, v24, 0, 16
+; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT: v_bfe_i32 v45, v37, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v43, v36, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v61, 16, v58
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v59, 16, v57
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v56
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v55
+; VI-DS128-NEXT: v_bfe_i32 v60, v58, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v58, v57, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v10, v56, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v8, v55, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v27
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v26
+; VI-DS128-NEXT: v_bfe_i32 v6, v27, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v4, v26, 0, 16
+; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224
+; VI-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240
+; VI-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192
+; VI-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208
+; VI-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160
+; VI-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176
+; VI-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128
+; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144
+; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96
+; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112
+; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64
+; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80
+; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32
+; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload
+; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload
+; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload
+; VI-DS128-NEXT: s_waitcnt vmcnt(0)
+; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
-; VI-DS128-NEXT: ds_write_b128 v32, v[0:3]
+; VI-DS128-NEXT: ds_write_b128 v24, v[0:3]
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
-; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16
+; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32:
@@ -4978,125 +4994,136 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: s_mov_b32 s14, -1
; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
-; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
-; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
+; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
+; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
-; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16
-; GFX9-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v10
+; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v1, v10, 0, 16
+; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
-; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
+; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v9
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v8
+; GFX9-DS128-NEXT: v_bfe_i32 v6, v9, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v4, v8, 0, 16
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v19
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v18
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16
-; GFX9-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v3, v19, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v1, v18, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26
-; GFX9-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v23
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v22
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v21
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v20
+; GFX9-DS128-NEXT: v_bfe_i32 v18, v23, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v16, v22, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v30, v21, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v28, v20, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36
-; GFX9-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16
-; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
-; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
-; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v27
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v26
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v35, 16, v25
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v33, 16, v24
+; GFX9-DS128-NEXT: v_bfe_i32 v22, v27, 0, 16
+; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
+; GFX9-DS128-NEXT: v_bfe_i32 v20, v26, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v34, v25, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v32, v24, 0, 16
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
+; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
+; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37
-; GFX9-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v39
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v38
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
-; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
-; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
-; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
-; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v27
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v26
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v25
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v24
+; GFX9-DS128-NEXT: v_bfe_i32 v49, v27, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v47, v26, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v53, v25, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v51, v24, 0, 16
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v37
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v36
+; GFX9-DS128-NEXT: v_bfe_i32 v41, v39, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v39, v38, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
-; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
-; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36
-; GFX9-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41
-; GFX9-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56
-; GFX9-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39
-; GFX9-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v25
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v24
+; GFX9-DS128-NEXT: v_bfe_i32 v2, v25, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v24, 0, 16
+; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
+; GFX9-DS128-NEXT: v_bfe_i32 v45, v37, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v43, v36, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v61, 16, v58
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v59, 16, v57
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v56
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v55
+; GFX9-DS128-NEXT: v_bfe_i32 v60, v58, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v58, v57, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v10, v56, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v8, v55, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v27
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v26
+; GFX9-DS128-NEXT: v_bfe_i32 v6, v27, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v4, v26, 0, 16
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32
+; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3]
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3]
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16
+; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16
; GFX9-DS128-NEXT: s_endpgm
%load = load <64 x i16>, ptr addrspace(3) %in
%ext = sext <64 x i16> %load to <64 x i32>
@@ -6691,53 +6718,53 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out
; VI-DS128: ; %bb.0:
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
-; VI-DS128-NEXT: v_mov_b32_e32 v26, 0
-; VI-DS128-NEXT: v_mov_b32_e32 v22, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v24, v26
+; VI-DS128-NEXT: v_mov_b32_e32 v25, 0
+; VI-DS128-NEXT: v_mov_b32_e32 v21, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v23, v25
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v5, s1
-; VI-DS128-NEXT: ds_read_b128 v[0:3], v5
-; VI-DS128-NEXT: ds_read_b128 v[13:16], v5 offset:16
-; VI-DS128-NEXT: v_mov_b32_e32 v11, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v19, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v8, v26
+; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
+; VI-DS128-NEXT: ds_read_b128 v[0:3], v4
+; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
+; VI-DS128-NEXT: v_mov_b32_e32 v28, s0
+; VI-DS128-NEXT: v_mov_b32_e32 v18, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v15, v25
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v13
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; VI-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v14
-; VI-DS128-NEXT: v_mov_b32_e32 v14, s0
-; VI-DS128-NEXT: v_mov_b32_e32 v13, v26
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; VI-DS128-NEXT: ds_write_b128 v14, v[21:24] offset:64
-; VI-DS128-NEXT: v_mov_b32_e32 v21, v26
-; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32
-; VI-DS128-NEXT: v_mov_b32_e32 v10, v26
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7
+; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v6
+; VI-DS128-NEXT: ds_write_b128 v28, v[20:23] offset:112
+; VI-DS128-NEXT: v_mov_b32_e32 v20, v25
+; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2
+; VI-DS128-NEXT: ds_write_b128 v28, v[17:20] offset:96
+; VI-DS128-NEXT: v_mov_b32_e32 v17, v25
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v1
+; VI-DS128-NEXT: ds_write_b128 v28, v[14:17] offset:32
+; VI-DS128-NEXT: v_mov_b32_e32 v12, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v14, v25
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v15
-; VI-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; VI-DS128-NEXT: ds_write_b128 v14, v[18:21] offset:112
-; VI-DS128-NEXT: v_mov_b32_e32 v16, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v18, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v1, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v3, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v28, v26
-; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16
-; VI-DS128-NEXT: v_mov_b32_e32 v5, v26
-; VI-DS128-NEXT: v_mov_b32_e32 v7, v26
-; VI-DS128-NEXT: ds_write_b128 v14, v[15:18] offset:96
-; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48
-; VI-DS128-NEXT: ds_write_b128 v14, v[25:28] offset:80
-; VI-DS128-NEXT: ds_write_b128 v14, v[4:7]
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v5
+; VI-DS128-NEXT: v_mov_b32_e32 v5, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v7, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v1, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v3, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v27, v25
+; VI-DS128-NEXT: ds_write_b128 v28, v[11:14] offset:16
+; VI-DS128-NEXT: v_mov_b32_e32 v9, v25
+; VI-DS128-NEXT: v_mov_b32_e32 v11, v25
+; VI-DS128-NEXT: ds_write_b128 v28, v[4:7] offset:64
+; VI-DS128-NEXT: ds_write_b128 v28, v[0:3] offset:48
+; VI-DS128-NEXT: ds_write_b128 v28, v[24:27] offset:80
+; VI-DS128-NEXT: ds_write_b128 v28, v[8:11]
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i64:
@@ -7362,12 +7389,12 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, v5
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
-; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
; VI-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
; VI-NO-DS128-NEXT: v_mov_b32_e32 v22, s0
-; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset0:2 offset1:3
-; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset1:1
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
+; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset1:1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2
; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29
@@ -7378,7 +7405,6 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5)
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9
; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9
@@ -7395,37 +7421,36 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6
; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v5
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v13
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(7)
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v16
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v14
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v14
+; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v10
+; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12
+; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v12
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v13
+; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v13
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3
; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, v5
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v10
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v17
-; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31
; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v5
; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v5
-; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v16
-; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v16
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7
; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v5
; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v5
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v15
-; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5
; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3
@@ -8010,12 +8035,12 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v12, s1
+; SI-NEXT: v_mov_b32_e32 v8, s1
; SI-NEXT: s_mov_b32 m0, -1
-; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
-; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
-; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7
-; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5
+; SI-NEXT: ds_read2_b64 v[4:7], v8 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[0:3], v8 offset1:1
+; SI-NEXT: ds_read2_b64 v[12:15], v8 offset0:4 offset1:5
+; SI-NEXT: ds_read2_b64 v[8:11], v8 offset0:6 offset1:7
; SI-NEXT: s_waitcnt lgkmcnt(3)
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v7
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7
@@ -8039,7 +8064,7 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; SI-NEXT: v_bfe_i32 v18, v1, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3
-; SI-NEXT: s_waitcnt lgkmcnt(5)
+; SI-NEXT: s_waitcnt lgkmcnt(4)
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v11
; SI-NEXT: v_bfe_i32 v18, v11, 0, 16
@@ -8050,7 +8075,6 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; SI-NEXT: v_bfe_i32 v18, v9, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27
-; SI-NEXT: s_waitcnt lgkmcnt(6)
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v15
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v15
; SI-NEXT: v_bfe_i32 v18, v15, 0, 16
@@ -8116,111 +8140,111 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
-; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v4 offset0:4 offset1:5
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0
+; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v11 offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v11 offset0:4 offset1:5
+; VI-NO-DS128-NEXT: v_mov_b32_e32 v15, s0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; VI-NO-DS128-NEXT: v_bfe_i32 v16, v3, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v18, v8, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v4 offset0:2 offset1:3
-; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v4 offset1:1
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:30 offset1:31
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v11 offset1:1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[11:14], v11 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:30 offset1:31
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:28 offset1:29
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:28 offset1:29
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[16:17] offset0:26 offset1:27
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v4
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5)
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:24 offset1:25
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:22 offset1:23
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; VI-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[1:2], v[16:17] offset0:26 offset1:27
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:20 offset1:21
+; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v1, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v0
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:24 offset1:25
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(6)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:16 offset1:17
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8)
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v14
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v18, v15, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v18, v14, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:22 offset1:23
-; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v14, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21
-; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:14 offset1:15
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v14, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v13, v13, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19
-; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v12, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[13:14], v[16:17] offset0:12 offset1:13
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v12
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:16 offset1:17
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v19, v10, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[19:20], v[17:18] offset0:14 offset1:15
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[9:10], v[17:18] offset0:12 offset1:13
-; VI-NO-DS128-NEXT: v_bfe_i32 v9, v0, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v8, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v13, v13, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v12, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[13:14] offset0:10 offset1:11
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v18, v11, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v8
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:8 offset1:9
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; VI-NO-DS128-NEXT: v_bfe_i32 v20, v7, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v7, v8, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[9:10] offset0:10 offset1:11
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v19, v7, 0, 16
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v6
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; VI-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[19:20], v[17:18] offset0:8 offset1:9
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v5, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v5, v6, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; VI-NO-DS128-NEXT: v_bfe_i32 v21, v3, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v3, v4, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[8:9] offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[3:4], v[13:14] offset0:2 offset1:3
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[21:22], v[1:2] offset1:1
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[12:13] offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[7:8], v[1:2] offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_write2_b64 v15, v[20:21], v[5:6] offset1:1
; VI-NO-DS128-NEXT: s_endpgm
;
; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64:
@@ -8682,111 +8706,112 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
-; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[9:12], v4 offset:32
-; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
-; VI-DS128-NEXT: ds_read_b128 v[17:20], v4 offset:16
-; VI-DS128-NEXT: ds_read_b128 v[4:7], v4
+; VI-DS128-NEXT: v_mov_b32_e32 v13, s1
+; VI-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:48
+; VI-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:32
+; VI-DS128-NEXT: v_mov_b32_e32 v12, s0
+; VI-DS128-NEXT: ds_read_b128 v[8:11], v13
+; VI-DS128-NEXT: ds_read_b128 v[18:21], v13 offset:16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
-; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224
-; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; VI-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224
+; VI-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:240
-; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; VI-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:208
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:240
+; VI-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:208
; VI-DS128-NEXT: s_waitcnt lgkmcnt(5)
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11
-; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12
-; VI-DS128-NEXT: v_bfe_i32 v0, v12, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160
+; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176
+; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-DS128-NEXT: s_waitcnt lgkmcnt(6)
-; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v19
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144
-; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v20
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96
-; VI-DS128-NEXT: v_bfe_i32 v9, v20, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v17
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:112
-; VI-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64
-; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192
+; VI-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128
+; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192
+; VI-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(7)
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144
+; VI-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96
+; VI-DS128-NEXT: v_bfe_i32 v13, v21, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112
; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80
-; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64
+; VI-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; VI-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v13, v11, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12]
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
+; VI-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32
+; VI-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:48
+; VI-DS128-NEXT: ds_write_b128 v12, v[6:9]
+; VI-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:16
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
index c9615f478e5b5..7cba97fc32936 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -11,13 +11,14 @@ define amdgpu_vs void @test(ptr addrspace(8) inreg %arg1, ptr addrspace(3) %arg2
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: ds_read_b32 v3, v1
; CHECK-NEXT: ds_read_b32 v2, v2
; CHECK-NEXT: ds_read_b32 v1, v4
-; CHECK-NEXT: ds_read_b32 v0, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: exp mrt0 off, off, off, off
; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float poison, float poison, float poison, float poison, i1 false, i1 false)
@@ -30,26 +31,26 @@ define amdgpu_vs void @test(ptr addrspace(8) inreg %arg1, ptr addrspace(3) %arg2
define amdgpu_vs void @test_2(ptr addrspace(8) inreg %arg1, i32 %arg2, i32 inreg %arg3, ptr addrspace(3) %arg4) {
; CHECK-LABEL: test_2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 16, v1
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 20, v1
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 16, v1
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 28, v1
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, 24, v1
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, 24, v1
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, 12, v1
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, 8, v1
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, 4, v1
; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: ds_read_b32 v2, v2
-; CHECK-NEXT: ds_read_b32 v5, v4
-; CHECK-NEXT: ds_read_b32 v4, v6
-; CHECK-NEXT: ds_read_b32 v9, v7
-; CHECK-NEXT: ds_read_b32 v8, v8
-; CHECK-NEXT: ds_read_b32 v7, v10
-; CHECK-NEXT: ds_read_b32 v6, v1
-; CHECK-NEXT: ds_read_b32 v3, v3
+; CHECK-NEXT: ds_read_b32 v1, v1
+; CHECK-NEXT: ds_read_b32 v6, v2
+; CHECK-NEXT: ds_read_b32 v5, v3
+; CHECK-NEXT: ds_read_b32 v8, v4
+; CHECK-NEXT: ds_read_b32 v4, v9
+; CHECK-NEXT: ds_read_b32 v3, v10
+; CHECK-NEXT: ds_read_b32 v2, v11
+; CHECK-NEXT: ds_read_b32 v7, v7
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
-; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc
+; CHECK-NEXT: tbuffer_store_format_xyzw v[1:4], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc
+; CHECK-NEXT: tbuffer_store_format_xyzw v[5:8], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc
; CHECK-NEXT: s_endpgm
%load = load <8 x float>, ptr addrspace(3) %arg4, align 4
%vec1 = shufflevector <8 x float> %load, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -67,38 +68,40 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8)
; CHECK-NEXT: s_mov_b32 s5, s3
; CHECK-NEXT: s_mov_b32 s4, s2
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 12, v1
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 8, v1
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v1
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v1
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v1
; CHECK-NEXT: v_mov_b32_e32 v10, s0
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
+; CHECK-NEXT: v_add_i32_e32 v13, vcc, 4, v2
; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: ds_read_b32 v6, v0
-; CHECK-NEXT: ds_read_b32 v5, v3
-; CHECK-NEXT: ds_read_b32 v4, v4
-; CHECK-NEXT: ds_read_b32 v8, v7
-; CHECK-NEXT: ds_read_b32 v7, v9
; CHECK-NEXT: ds_read_b32 v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, 20, v2
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v2
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
-; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
-; CHECK-NEXT: s_waitcnt expcnt(1)
-; CHECK-NEXT: ds_read_b32 v5, v11
-; CHECK-NEXT: ds_read_b32 v4, v12
-; CHECK-NEXT: ds_read_b32 v3, v0
-; CHECK-NEXT: ds_read_b32 v1, v1
+; CHECK-NEXT: ds_read_b32 v6, v0
+; CHECK-NEXT: ds_read_b32 v5, v4
+; CHECK-NEXT: ds_read_b32 v4, v7
+; CHECK-NEXT: ds_read_b32 v1, v8
; CHECK-NEXT: ds_read_b32 v0, v9
-; CHECK-NEXT: ds_read_b32 v2, v2
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v2
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, 16, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
+; CHECK-NEXT: s_waitcnt expcnt(0)
+; CHECK-NEXT: ds_read_b32 v0, v2
+; CHECK-NEXT: ds_read_b32 v3, v11
+; CHECK-NEXT: ds_read_b32 v2, v12
+; CHECK-NEXT: ds_read_b32 v1, v13
+; CHECK-NEXT: ds_read_b32 v5, v7
+; CHECK-NEXT: ds_read_b32 v4, v8
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
; CHECK-NEXT: exp mrt0 off, off, off, off
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
-; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
+; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
; CHECK-NEXT: s_endpgm
%load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4
%vec11 = shufflevector <6 x float> %load1, <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index 5b6af7654f7e9..06b790457b0fa 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -95,53 +95,52 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v2, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:4
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:2
-; GFX7-NEXT: ds_read_u8 v5, v0
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v1, v0
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v7
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v7
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:13
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:15
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:13
; GFX7-NEXT: ds_read_u8 v9, v0 offset:14
-; GFX7-NEXT: ds_read_u8 v10, v0 offset:12
-; GFX7-NEXT: ds_read_u8 v11, v0 offset:10
-; GFX7-NEXT: ds_read_u8 v0, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v9
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v10
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -156,6 +155,7 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
; GFX6-NEXT: v_add_i32_e32 v6, vcc, 8, v0
; GFX6-NEXT: v_add_i32_e32 v7, vcc, 11, v0
; GFX6-NEXT: s_mov_b32 m0, -1
+; GFX6-NEXT: ds_read_u8 v8, v0
; GFX6-NEXT: ds_read_u8 v1, v1
; GFX6-NEXT: ds_read_u8 v2, v2
; GFX6-NEXT: ds_read_u8 v3, v3
@@ -163,22 +163,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
; GFX6-NEXT: ds_read_u8 v5, v5
; GFX6-NEXT: ds_read_u8 v6, v6
; GFX6-NEXT: ds_read_u8 v7, v7
-; GFX6-NEXT: ds_read_u8 v8, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(7)
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(6)
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(5)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(4)
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; GFX6-NEXT: s_waitcnt lgkmcnt(3)
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT: s_waitcnt lgkmcnt(3)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v5
; GFX6-NEXT: s_waitcnt lgkmcnt(2)
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v5
; GFX6-NEXT: s_waitcnt lgkmcnt(1)
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v7
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 10, v0
; GFX6-NEXT: v_add_i32_e32 v5, vcc, 13, v0
@@ -331,27 +330,26 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_u16 v1, v0 offset:2
-; GFX7-NEXT: ds_read_u16 v3, v0 offset:12
-; GFX7-NEXT: ds_read_u16 v2, v0 offset:8
-; GFX7-NEXT: ds_read_u16 v4, v0 offset:4
-; GFX7-NEXT: ds_read_u16 v5, v0
-; GFX7-NEXT: ds_read_u16 v6, v0 offset:6
-; GFX7-NEXT: ds_read_u16 v7, v0 offset:10
+; GFX7-NEXT: ds_read_u16 v1, v0
+; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
+; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
+; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
+; GFX7-NEXT: ds_read_u16 v5, v0 offset:8
+; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
+; GFX7-NEXT: ds_read_u16 v7, v0 offset:12
; GFX7-NEXT: ds_read_u16 v8, v0 offset:14
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align2:
@@ -365,6 +363,7 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
; GFX6-NEXT: v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT: v_add_i32_e32 v7, vcc, 2, v0
; GFX6-NEXT: s_mov_b32 m0, -1
+; GFX6-NEXT: ds_read_u16 v0, v0
; GFX6-NEXT: ds_read_u16 v1, v1
; GFX6-NEXT: ds_read_u16 v2, v2
; GFX6-NEXT: ds_read_u16 v3, v3
@@ -372,21 +371,19 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
; GFX6-NEXT: ds_read_u16 v5, v5
; GFX6-NEXT: ds_read_u16 v6, v6
; GFX6-NEXT: ds_read_u16 v7, v7
-; GFX6-NEXT: ds_read_u16 v0, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(7)
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(6)
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(5)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(4)
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(3)
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT: s_waitcnt lgkmcnt(2)
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: s_waitcnt lgkmcnt(1)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX6-NEXT: v_or_b32_e32 v3, v3, v6
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -462,10 +459,10 @@ define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; GFX6-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v2, v2
-; GFX6-NEXT: ds_read_b32 v3, v3
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: ds_read_b32 v1, v1
+; GFX6-NEXT: ds_read_b32 v2, v2
+; GFX6-NEXT: ds_read_b32 v3, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index 509aba49893f6..3c882daef614a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -86,43 +86,41 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v2, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v1, v0
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
-; GFX7-NEXT: ds_read_u8 v5, v0
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v2, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:8
; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:10
-; GFX7-NEXT: ds_read_u8 v0, v0 offset:8
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
+; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v8
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -137,6 +135,7 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
; GFX6-NEXT: v_add_i32_e32 v6, vcc, 8, v0
; GFX6-NEXT: v_add_i32_e32 v7, vcc, 11, v0
; GFX6-NEXT: s_mov_b32 m0, -1
+; GFX6-NEXT: ds_read_u8 v8, v0
; GFX6-NEXT: ds_read_u8 v1, v1
; GFX6-NEXT: ds_read_u8 v2, v2
; GFX6-NEXT: ds_read_u8 v3, v3
@@ -144,22 +143,21 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
; GFX6-NEXT: ds_read_u8 v5, v5
; GFX6-NEXT: ds_read_u8 v6, v6
; GFX6-NEXT: ds_read_u8 v7, v7
-; GFX6-NEXT: ds_read_u8 v8, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(7)
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(6)
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(5)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(4)
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; GFX6-NEXT: s_waitcnt lgkmcnt(3)
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 10, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: ds_read_u8 v4, v4
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT: s_waitcnt lgkmcnt(4)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v5
; GFX6-NEXT: s_waitcnt lgkmcnt(3)
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v5
+; GFX6-NEXT: s_waitcnt lgkmcnt(2)
; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_add_i32_e32 v5, vcc, 3, v0
; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0
@@ -167,7 +165,7 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
; GFX6-NEXT: ds_read_u8 v5, v5
; GFX6-NEXT: ds_read_u8 v6, v6
; GFX6-NEXT: ds_read_u8 v0, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(5)
+; GFX6-NEXT: s_waitcnt lgkmcnt(4)
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v7
; GFX6-NEXT: s_waitcnt lgkmcnt(3)
; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
@@ -274,22 +272,21 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_u16 v1, v0 offset:2
-; GFX7-NEXT: ds_read_u16 v2, v0 offset:8
+; GFX7-NEXT: ds_read_u16 v1, v0
+; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
-; GFX7-NEXT: ds_read_u16 v4, v0
-; GFX7-NEXT: ds_read_u16 v5, v0 offset:6
+; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
+; GFX7-NEXT: ds_read_u16 v5, v0 offset:8
; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
-; GFX7-NEXT: s_waitcnt lgkmcnt(5)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v3i32_align2:
@@ -301,22 +298,22 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 8, v0
; GFX6-NEXT: v_add_i32_e32 v5, vcc, 2, v0
; GFX6-NEXT: s_mov_b32 m0, -1
+; GFX6-NEXT: ds_read_u16 v0, v0
; GFX6-NEXT: ds_read_u16 v1, v1
; GFX6-NEXT: ds_read_u16 v2, v2
; GFX6-NEXT: ds_read_u16 v3, v3
; GFX6-NEXT: ds_read_u16 v4, v4
; GFX6-NEXT: ds_read_u16 v5, v5
-; GFX6-NEXT: ds_read_u16 v0, v0
-; GFX6-NEXT: s_waitcnt lgkmcnt(5)
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(4)
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(3)
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: s_waitcnt lgkmcnt(2)
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(1)
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -383,9 +380,9 @@ define <3 x i32> @load_lds_v3i32_align4(ptr addrspace(3) %ptr) {
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 4, v0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v2, v2
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: ds_read_b32 v1, v1
+; GFX6-NEXT: ds_read_b32 v2, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 9ecd35e7ddd11..3302bbb667f28 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -425,8 +425,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_cbranch_execz .LBB0_27
; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
-; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
+; CHECK-NEXT: global_load_dwordx2 v[58:59], v[0:1], off offset:24
+; CHECK-NEXT: global_load_dwordx2 v[60:61], v[2:3], off offset:24
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
@@ -465,14 +465,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_31
; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
-; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58
+; CHECK-NEXT: v_xor_b32_e32 v4, v58, v60
; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57]
; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[66:67]
; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72
; CHECK-NEXT: v_lshlrev_b32_e32 v9, 12, v63
-; CHECK-NEXT: v_xor_b32_e32 v5, v61, v59
+; CHECK-NEXT: v_xor_b32_e32 v5, v59, v61
; CHECK-NEXT: v_lshlrev_b32_e32 v11, 16, v56
; CHECK-NEXT: v_or_b32_e32 v3, v1, v3
; CHECK-NEXT: v_lshrrev_b64 v[0:1], 16, v[45:46]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index b2813b392d253..9b09f5320c4c9 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -56,46 +56,46 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
;
; GFX9-LABEL: madak_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000
+; GFX9-NEXT: v_madak_f32 v1, v2, v1, 0x41200000
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-MAD-LABEL: madak_f32:
; GFX10-MAD: ; %bb.0:
; GFX10-MAD-NEXT: s_clause 0x1
-; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT: s_clause 0x1
-; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000
+; GFX10-MAD-NEXT: v_madak_f32 v1, v2, v1, 0x41200000
; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-MAD-NEXT: s_endpgm
;
; GFX11-MAD-LABEL: madak_f32:
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
-; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
-; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -103,47 +103,47 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
;
; GFX942-FMA-LABEL: madak_f32:
; GFX942-FMA: ; %bb.0:
-; GFX942-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-FMA-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX942-FMA-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX942-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX942-FMA-NEXT: global_load_dword v2, v0, s[2:3]
; GFX942-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX942-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX942-FMA-NEXT: v_fmaak_f32 v1, v2, v1, 0x41200000
; GFX942-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-FMA-NEXT: s_endpgm
;
; GFX10-FMA-LABEL: madak_f32:
; GFX10-FMA: ; %bb.0:
; GFX10-FMA-NEXT: s_clause 0x1
-; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT: s_clause 0x1
-; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, v2, v1, 0x41200000
; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-FMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: madak_f32:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, v2, v1, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -527,46 +527,46 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
;
; GFX9-LABEL: madak_inline_imm_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0
+; GFX9-NEXT: v_mad_f32 v1, v2, v1, 4.0
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-MAD-LABEL: madak_inline_imm_f32:
; GFX10-MAD: ; %bb.0:
; GFX10-MAD-NEXT: s_clause 0x1
-; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT: s_clause 0x1
-; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0
+; GFX10-MAD-NEXT: v_mad_f32 v1, v2, v1, 4.0
; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-MAD-NEXT: s_endpgm
;
; GFX11-MAD-LABEL: madak_inline_imm_f32:
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
-; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
-; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -574,47 +574,47 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
;
; GFX942-FMA-LABEL: madak_inline_imm_f32:
; GFX942-FMA: ; %bb.0:
-; GFX942-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-FMA-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX942-FMA-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX942-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX942-FMA-NEXT: global_load_dword v2, v0, s[2:3]
; GFX942-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX942-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
+; GFX942-FMA-NEXT: v_fma_f32 v1, v2, v1, 4.0
; GFX942-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-FMA-NEXT: s_endpgm
;
; GFX10-FMA-LABEL: madak_inline_imm_f32:
; GFX10-FMA: ; %bb.0:
; GFX10-FMA-NEXT: s_clause 0x1
-; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT: s_clause 0x1
-; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
+; GFX10-FMA-NEXT: v_fma_f32 v1, v2, v1, 4.0
; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-FMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: madak_inline_imm_f32:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
+; GFX11-FMA-NEXT: v_fma_f32 v1, v2, v1, 4.0
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -1045,47 +1045,47 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
;
; GFX9-LABEL: no_madak_src0_modifier_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0x41200000
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s2
+; GFX9-NEXT: v_mad_f32 v1, |v2|, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-MAD-LABEL: no_madak_src0_modifier_f32:
; GFX10-MAD: ; %bb.0:
; GFX10-MAD-NEXT: s_clause 0x1
-; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT: s_clause 0x1
-; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000
+; GFX10-MAD-NEXT: v_mad_f32 v1, |v2|, v1, 0x41200000
; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-MAD-NEXT: s_endpgm
;
; GFX11-MAD-LABEL: no_madak_src0_modifier_f32:
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
-; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
-; GFX11-MAD-NEXT: v_mul_f32_e64 v1, |v1|, v2
+; GFX11-MAD-NEXT: v_mul_f32_e64 v1, |v2|, v1
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -1093,48 +1093,48 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
;
; GFX942-FMA-LABEL: no_madak_src0_modifier_f32:
; GFX942-FMA: ; %bb.0:
-; GFX942-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-FMA-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX942-FMA-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX942-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX942-FMA-NEXT: global_load_dword v2, v0, s[2:3]
; GFX942-FMA-NEXT: s_mov_b32 s2, 0x41200000
; GFX942-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX942-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s2
+; GFX942-FMA-NEXT: v_fma_f32 v1, |v2|, v1, s2
; GFX942-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-FMA-NEXT: s_endpgm
;
; GFX10-FMA-LABEL: no_madak_src0_modifier_f32:
; GFX10-FMA: ; %bb.0:
; GFX10-FMA-NEXT: s_clause 0x1
-; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT: s_clause 0x1
-; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000
+; GFX10-FMA-NEXT: v_fma_f32 v1, |v2|, v1, 0x41200000
; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-FMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: no_madak_src0_modifier_f32:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000
+; GFX11-FMA-NEXT: v_fma_f32 v1, |v2|, v1, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -1199,47 +1199,47 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
;
; GFX9-LABEL: no_madak_src1_modifier_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0x41200000
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s2
+; GFX9-NEXT: v_mad_f32 v1, v2, |v1|, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-MAD-LABEL: no_madak_src1_modifier_f32:
; GFX10-MAD: ; %bb.0:
; GFX10-MAD-NEXT: s_clause 0x1
-; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT: s_clause 0x1
-; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000
+; GFX10-MAD-NEXT: v_mad_f32 v1, v2, |v1|, 0x41200000
; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-MAD-NEXT: s_endpgm
;
; GFX11-MAD-LABEL: no_madak_src1_modifier_f32:
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
-; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
-; GFX11-MAD-NEXT: v_mul_f32_e64 v1, v1, |v2|
+; GFX11-MAD-NEXT: v_mul_f32_e64 v1, v2, |v1|
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -1247,48 +1247,48 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
;
; GFX942-FMA-LABEL: no_madak_src1_modifier_f32:
; GFX942-FMA: ; %bb.0:
-; GFX942-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-FMA-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX942-FMA-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX942-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX942-FMA-NEXT: global_load_dword v2, v0, s[2:3]
; GFX942-FMA-NEXT: s_mov_b32 s2, 0x41200000
; GFX942-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX942-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s2
+; GFX942-FMA-NEXT: v_fma_f32 v1, v2, |v1|, s2
; GFX942-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-FMA-NEXT: s_endpgm
;
; GFX10-FMA-LABEL: no_madak_src1_modifier_f32:
; GFX10-FMA: ; %bb.0:
; GFX10-FMA-NEXT: s_clause 0x1
-; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT: s_clause 0x1
-; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000
+; GFX10-FMA-NEXT: v_fma_f32 v1, v2, |v1|, 0x41200000
; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-FMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: no_madak_src1_modifier_f32:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000
+; GFX11-FMA-NEXT: v_fma_f32 v1, v2, |v1|, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index a5b64f6f80d9b..2787edb9d4abe 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -145,15 +145,15 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v3, v0, s[2:3]
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
-; GFX9-NEXT: global_load_dword v4, v0, s[2:3]
-; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3
+; GFX9-NEXT: global_load_dword v4, v0, s[6:7]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v1, v0, s[6:7] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_pk_max_i16 v3, v4, v3
+; GFX9-NEXT: v_pk_max_i16 v3, v3, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v1, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 5b7c36559a366..4d58786002bd7 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -169,23 +169,23 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:92
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
@@ -197,22 +197,22 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
-; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:96
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8
-; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -457,23 +457,23 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:92
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
@@ -485,22 +485,22 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
-; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:96
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8
-; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 4f2816538b1ff..672b4357750ad 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -19,11 +19,11 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:128
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25]
-; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128
+; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25]
+; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:16
; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60
; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48
@@ -50,15 +50,14 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:128
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15]
; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19]
; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
@@ -3649,54 +3648,54 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:232
; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244
; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:212
-; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:192
-; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:180
-; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:160
-; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156
-; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152
-; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148
-; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:168
+; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:172
+; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
@@ -3713,24 +3712,21 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_waitcnt vmcnt(43)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(40)
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:192
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:176
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:144
-; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:144
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10]
@@ -3806,31 +3802,30 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:31
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:38
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39
; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:47
; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48
; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49
; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50
@@ -3839,49 +3834,45 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53
; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66
; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69
; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:70
; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:73
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_clause 0x35
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: s_clause 0x30
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:174
; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:171
; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:176
; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:177
; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:178
@@ -3894,62 +3885,62 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189
; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:190
; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:187
; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193
; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194
; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:196
; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197
; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:198
; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:199
; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:7
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v9
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 8, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 8, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v21, 8, v19
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v27, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v29, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v26, 8, v25
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
@@ -3969,76 +3960,83 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v17, 8, v38
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v52, 8, v51
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v55
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: s_clause 0x2
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:84
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: s_waitcnt vmcnt(60)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x5
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:212
; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:213
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v82, 8, v81
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:211
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
@@ -4056,31 +4054,31 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:229
; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230
; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:239
; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:235
; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0xc
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0xb
; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:240
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:241
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
@@ -4090,55 +4088,53 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:246
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:247
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v90, 8, v121
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(59)
+; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v78, v78, 8, v93
+; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(56)
-; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(55)
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(54)
-; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(52)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: s_waitcnt vmcnt(50)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v3
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -4146,33 +4142,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
@@ -4180,7 +4178,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4188,23 +4186,24 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:100
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -4212,33 +4211,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
@@ -4246,7 +4247,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4254,23 +4255,24 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:116
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -4278,33 +4280,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
@@ -4312,7 +4316,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4320,23 +4324,24 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:132
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -4344,33 +4349,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
@@ -4378,7 +4385,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4386,184 +4393,188 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:148
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v86, 8, v96
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v12
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v9, 8, v11
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:159
; ALIGNED-NEXT: v_lshl_or_b32 v5, v14, 8, v18
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v83
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v110
-; ALIGNED-NEXT: v_lshl_or_b32 v79, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v83, 8, v84
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v82
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v120, 8, v123
-; ALIGNED-NEXT: v_lshl_or_b32 v72, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v127
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v70
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v104, 8, v108
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v108, 8, v109
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v70
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v125
; ALIGNED-NEXT: v_lshl_or_b32 v45, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v71
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v53
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v71
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94
-; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v69
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v53, 8, v54
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v105
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v51, 8, v54
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v123
+; ALIGNED-NEXT: v_lshl_or_b32 v118, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v68
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v77
; ALIGNED-NEXT: v_lshl_or_b32 v114, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v49
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v88, 8, v89
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v38, 8, v48
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v62, 8, v63
; ALIGNED-NEXT: v_lshl_or_b32 v98, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62
; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v31
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
; ALIGNED-NEXT: v_lshl_or_b32 v81, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v36
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v34
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
-; ALIGNED-NEXT: v_lshl_or_b32 v68, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v34
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v28, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v27
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
-; ALIGNED-NEXT: v_lshl_or_b32 v50, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v24, 8, v26
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
-; ALIGNED-NEXT: v_lshl_or_b32 v48, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v22, 8, v23
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
+; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21
; ALIGNED-NEXT: v_lshl_or_b32 v4, v16, 8, v15
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v20
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v116, 8, v117
; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v13
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v40
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v13
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v100, 8, v101
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115
-; ALIGNED-NEXT: v_lshl_or_b32 v107, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v115
+; ALIGNED-NEXT: v_lshl_or_b32 v121, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v112
; ALIGNED-NEXT: v_lshl_or_b32 v1, v99, 8, v102
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: v_lshl_or_b32 v94, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v96, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v85, 8, v87
+; ALIGNED-NEXT: v_lshl_or_b32 v91, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:251
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v7, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v9, 8, v1
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v89, 16, v6
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v10
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v109, 8, v106
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v122, 8, v111
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
-; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v1, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v76, 16, v6
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v106, 8, v90
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v127, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v111, 8, v122
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v121, 8, v109
+; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 16, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v78, v95, 8, v92
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v90, 8, v79
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:18
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:16
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:17
-; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 16, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v107, 8, v124
+; ALIGNED-NEXT: v_lshl_or_b32 v78, v106, 8, v110
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 16, v76
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshl_or_b32 v76, v95, 8, v104
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v78, v92, 8, v93
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 16, v76
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v90
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v78
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v126, v89, 8, v106
+; ALIGNED-NEXT: v_lshl_or_b32 v126, v76, 8, v90
; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4575,14 +4586,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v3, 3
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v4, vcc_lo
-; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:247
-; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248
-; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:246
-; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:252
-; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:250
-; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:251
+; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:247
+; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:248
+; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:246
+; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:252
+; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:250
+; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:251
; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:249
-; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:245
+; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:245
; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:239
; ALIGNED-NEXT: flat_store_byte v[5:6], v16 offset:240
; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:238
@@ -4591,18 +4602,18 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:243
; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:241
; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:237
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:252
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244
; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:231
-; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:232
-; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:230
-; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:236
-; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:234
-; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:235
+; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:231
+; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:232
+; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:230
+; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:236
+; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:234
+; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:235
; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:233
-; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:229
+; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:229
; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:223
; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:224
; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:222
@@ -4613,44 +4624,44 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:221
; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:192
; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:204
-; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:200
; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:196
; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:210
-; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:212
-; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:206
-; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:208
-; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:207
+; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:212
+; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:206
+; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:208
+; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:207
; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:211
; ALIGNED-NEXT: flat_store_byte v[5:6], v80 offset:209
-; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:215
-; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:216
-; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:214
-; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:220
-; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:218
-; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:219
-; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:217
+; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:215
+; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:216
+; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:214
+; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:220
+; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:218
+; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:219
+; ALIGNED-NEXT: flat_store_byte v[5:6], v68 offset:217
; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:213
-; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:205
-; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:220
-; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:205
+; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:208
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:199
-; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:200
-; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:198
-; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:204
-; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:202
-; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:203
+; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:199
+; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:200
+; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:198
+; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:204
+; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:202
+; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:203
; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:201
-; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:197
+; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:197
; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:191
; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:192
-; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:190
+; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:190
; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:196
; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:194
; ALIGNED-NEXT: flat_store_byte v[5:6], v102 offset:195
-; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:193
+; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:193
; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:189
; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
@@ -4666,14 +4677,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:183
-; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:184
-; ALIGNED-NEXT: flat_store_byte v[5:6], v118 offset:182
+; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:183
+; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:184
+; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:182
; ALIGNED-NEXT: flat_store_byte v[5:6], v41 offset:188
; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:186
; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:187
; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:185
-; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:181
+; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:181
; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:175
; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:176
; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:174
@@ -4693,70 +4704,78 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:167
-; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:168
-; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:166
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:167
+; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:168
+; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:166
; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:172
-; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:170
-; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:171
-; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:169
-; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:165
-; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:159
-; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:160
-; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:158
-; ALIGNED-NEXT: flat_store_byte v[5:6], v91 offset:164
-; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:162
-; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:163
-; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:161
-; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:157
+; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:170
+; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:171
+; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:169
+; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:165
+; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:159
+; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:160
+; ALIGNED-NEXT: flat_store_byte v[5:6], v122 offset:158
+; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:164
+; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:162
+; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:163
+; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:161
+; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151
-; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:152
-; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:150
-; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:156
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v127 offset:151
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:152
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:156
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153
-; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:149
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
@@ -5002,13 +5021,13 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
@@ -5017,10 +5036,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
@@ -5213,11 +5232,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21
-; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16
-; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:14
+; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:15
+; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:16
+; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20
@@ -5230,55 +5247,49 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17
-; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:16
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:16
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:7
-; ALIGNED-NEXT: flat_store_byte v[5:6], v121 offset:8
-; ALIGNED-NEXT: flat_store_byte v[5:6], v127 offset:10
-; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:6
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9
-; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:8
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:7
+; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:8
+; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:10
+; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:6
+; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:12
+; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:11
+; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:9
+; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[3:4], v0
; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1
@@ -5408,27 +5419,27 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_co_u32 v52, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v53, null, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[52:53] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[52:53]
-; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v52, 48
-; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v53, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v52, 0x60
-; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v53, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v52
-; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v53, vcc_lo
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25]
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16
+; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v25, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60
+; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v24
+; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v25, vcc_lo
; CHECK-NEXT: s_clause 0xd
-; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[52:53] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[52:53] offset:64
-; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[80:81] offset:128
-; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:144
-; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:96
-; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:112
-; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:64
-; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:80
-; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[52:53] offset:128
+; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32
+; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128
+; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128
+; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:144
+; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96
+; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:112
+; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[80:81] offset:80
; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48
; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81]
; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16
@@ -5441,25 +5452,24 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11]
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7]
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:144
; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
@@ -5483,11 +5493,11 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:128
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25]
-; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128
+; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25]
+; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:16
; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60
; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48
@@ -5514,15 +5524,14 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:128
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15]
; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19]
; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
@@ -12550,54 +12559,54 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168
; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180
; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:232
; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:244
; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:156
-; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:152
-; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:148
-; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
@@ -12613,20 +12622,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:192
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:176
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:128
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(11)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10]
@@ -12656,54 +12663,54 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244
-; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
-; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:232
+; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:236
+; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180
; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
-; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156
-; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152
-; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148
-; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
@@ -12719,24 +12726,19 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, -1
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:192
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:176
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144
-; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:144
; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10]
@@ -12820,17 +12822,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:31
; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
@@ -12855,6 +12856,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62
@@ -12862,7 +12864,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69
@@ -12870,86 +12871,90 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:73
; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_clause 0x31
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: s_clause 0x34
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:6
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v9
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 8, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 8, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
@@ -12957,8 +12962,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25
; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
@@ -12993,7 +12998,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x2
@@ -13002,137 +13006,133 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v82, 8, v81
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:215
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:235
; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0xc
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:252
; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(44)
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(50)
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(43)
+; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
+; ALIGNED-NEXT: s_waitcnt vmcnt(48)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: s_waitcnt vmcnt(47)
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
+; ALIGNED-NEXT: s_waitcnt vmcnt(46)
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
+; ALIGNED-NEXT: s_waitcnt vmcnt(45)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(32)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
-; ALIGNED-NEXT: s_clause 0x8
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v3
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:223
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
@@ -13140,6 +13140,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:95
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -13150,30 +13152,32 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
@@ -13181,7 +13185,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -13191,7 +13195,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
@@ -13206,6 +13209,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:111
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -13216,40 +13221,42 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
@@ -13257,7 +13264,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
@@ -13272,6 +13278,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:127
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
@@ -13282,30 +13290,32 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
@@ -13313,7 +13323,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -13321,503 +13331,499 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:132
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:148
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v98, 8, v100
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v13
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v15, 8, v20
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v11
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v4
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v84, 8, v85
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v14, 8, v18
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v82
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v80
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v9, 8, v7
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v47, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v81
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v65
+; ALIGNED-NEXT: v_lshl_or_b32 v40, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v70
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v67
+; ALIGNED-NEXT: v_lshl_or_b32 v116, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v49, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v50
+; ALIGNED-NEXT: v_lshl_or_b32 v100, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v83, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v36
+; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v31
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v27, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v24, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v16, 8, v15
+; ALIGNED-NEXT: v_lshl_or_b32 v26, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v20
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v22, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5
+; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v123, 8, v125
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v110
-; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v96
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v86
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v123
-; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v70, 8, v84
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v111, 8, v121
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v124
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v95
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v82
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v109
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v109
-; ALIGNED-NEXT: v_lshl_or_b32 v57, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v83
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v65
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v106
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v104
-; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v81
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v107
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v66
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v88
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v79, 8, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v118, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v53
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v51
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v73
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v90
-; ALIGNED-NEXT: v_lshl_or_b32 v102, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v74, 8, v76
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v74
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v35
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v57
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v9
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v4, 8, v7
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v77
-; ALIGNED-NEXT: v_lshl_or_b32 v85, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v48
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v107, 16, v6
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v59
-; ALIGNED-NEXT: v_lshl_or_b32 v80, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v60
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v62
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v62
-; ALIGNED-NEXT: v_lshl_or_b32 v54, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v43, 8, v44
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v72
-; ALIGNED-NEXT: v_lshl_or_b32 v52, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v45, 8, v46
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v45, 8, v46
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v119
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v56
-; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v22
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v42
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v40
-; ALIGNED-NEXT: v_lshl_or_b32 v24, v5, 16, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v14
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v102, 8, v103
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v42, 8, v43
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v16, v6, 16, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v10
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v113
-; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v121, 16, v6
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v116, 8, v119
-; ALIGNED-NEXT: v_lshl_or_b32 v108, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v103, 8, v114
-; ALIGNED-NEXT: v_lshl_or_b32 v92, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v104, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v114
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v101, 8, v112
+; ALIGNED-NEXT: v_lshl_or_b32 v90, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v98, 8, v99
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v87, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v89, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v107, 8, v1
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v108, v0, 8, v108
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 16, v107
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v120, 8, v110
+; ALIGNED-NEXT: v_lshl_or_b32 v108, v3, 8, v126
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 16, v107
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v121, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v0, 8, v1
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v5, 8, v124
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v4, 8, v125
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v3, 8, v1
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:11
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v4, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:10
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v1, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_mov_b32_e32 v4, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v108, v108, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 16, v107
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v108, v110, 8, v1
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v1, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:18
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_lshl_or_b32 v107, v0, 8, v3
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_mov_b32_e32 v3, v110
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 16, v107
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:17
; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v110
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v126, v107, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v126
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704
; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:708
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v124
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v1, 8, v125
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_add_co_u32 v121, vcc_lo, v5, s4
+; ALIGNED-NEXT: v_add_co_u32 v126, vcc_lo, v5, s4
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v122, null, s5, v6, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v127, null, s5, v6, vcc_lo
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
-; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v121, 3
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v122, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v126, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v127, vcc_lo
; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:247
-; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248
+; ALIGNED-NEXT: flat_store_byte v[5:6], v4 offset:248
; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:246
-; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:252
+; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:252
; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:250
-; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:251
-; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:249
-; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:245
-; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:239
-; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:240
-; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:238
-; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:244
-; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:242
-; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:243
-; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:241
-; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:237
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:244
-; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:231
-; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:232
-; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:230
-; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:236
-; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:234
-; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:235
-; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:233
+; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:251
+; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:249
+; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:245
+; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:239
+; ALIGNED-NEXT: flat_store_byte v[5:6], v16 offset:240
+; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:238
+; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:244
+; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:242
+; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:243
+; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:241
+; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:237
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:231
+; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:232
+; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:230
+; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:236
+; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:234
+; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:235
+; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:233
; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:229
-; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:223
-; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:224
-; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:222
-; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:228
-; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:226
-; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:227
-; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:225
-; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:221
-; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:192
-; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:204
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200
-; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:210
+; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:223
+; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:224
+; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:222
+; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:228
+; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:226
+; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:227
+; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:225
+; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:221
+; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:210
; ALIGNED-NEXT: flat_store_byte v[5:6], v68 offset:212
-; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:206
-; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:208
-; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:207
-; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:211
-; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:209
+; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:206
+; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:208
+; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:207
+; ALIGNED-NEXT: flat_store_byte v[5:6], v80 offset:211
+; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:209
; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:215
; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:216
-; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:214
-; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:220
-; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:218
-; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:219
-; ALIGNED-NEXT: flat_store_byte v[5:6], v81 offset:217
-; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:213
-; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:205
-; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
-; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:208
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:199
-; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:200
-; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:198
-; ALIGNED-NEXT: flat_store_byte v[5:6], v98 offset:204
-; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:202
-; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:203
-; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:201
+; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:214
+; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:220
+; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:218
+; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:219
+; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:217
+; ALIGNED-NEXT: flat_store_byte v[5:6], v81 offset:213
+; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:205
+; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:199
+; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:200
+; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:198
+; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:204
+; ALIGNED-NEXT: flat_store_byte v[5:6], v98 offset:202
+; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:203
+; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:201
; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:197
-; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:191
-; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:192
-; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:190
-; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:196
-; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:194
-; ALIGNED-NEXT: flat_store_byte v[5:6], v114 offset:195
-; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:193
-; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:189
+; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:191
+; ALIGNED-NEXT: flat_store_byte v[5:6], v102 offset:192
+; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:190
+; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:196
+; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:194
+; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:195
+; ALIGNED-NEXT: flat_store_byte v[5:6], v114 offset:193
+; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:189
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:183
-; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:184
-; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:182
-; ALIGNED-NEXT: flat_store_byte v[5:6], v45 offset:188
-; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:186
-; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:187
-; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:185
-; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:181
-; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:175
-; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:176
-; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:174
-; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:180
-; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:178
-; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:179
-; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:177
-; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:173
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:183
+; ALIGNED-NEXT: flat_store_byte v[5:6], v118 offset:184
+; ALIGNED-NEXT: flat_store_byte v[5:6], v41 offset:182
+; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:188
+; ALIGNED-NEXT: flat_store_byte v[5:6], v45 offset:186
+; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:187
+; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:185
+; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:181
+; ALIGNED-NEXT: flat_store_byte v[5:6], v57 offset:175
+; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:176
+; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:174
+; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:180
+; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:178
+; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:179
+; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:177
+; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:173
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:167
-; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:168
-; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:166
-; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:172
-; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:170
-; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:171
-; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:169
-; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:165
-; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:159
-; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:160
-; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:158
-; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:164
-; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:162
-; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:163
-; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:161
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:167
+; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:168
+; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:166
+; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:172
+; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:170
+; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:171
+; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:169
+; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:165
+; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:159
+; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:160
+; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:158
+; ALIGNED-NEXT: flat_store_byte v[5:6], v91 offset:164
+; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:162
+; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:163
+; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:161
; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151
-; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:152
-; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:150
-; ALIGNED-NEXT: flat_store_byte v[5:6], v126 offset:156
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v121 offset:151
+; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:152
+; ALIGNED-NEXT: flat_store_byte v[5:6], v122 offset:150
+; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:156
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:155
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153
-; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:149
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload
@@ -13826,13 +13832,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:132
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload
@@ -13840,7 +13846,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:128
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:128
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360
@@ -13904,10 +13910,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
@@ -13919,16 +13925,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
@@ -14080,7 +14086,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:64
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:64
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424
@@ -14188,7 +14194,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:32
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:32
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36
@@ -14210,7 +14216,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23
@@ -14235,11 +14243,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21
-; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16
-; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:14
+; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:15
+; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:16
+; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20
@@ -14252,63 +14258,63 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17
-; ALIGNED-NEXT: flat_store_byte v[121:122], v125 offset:16
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v120 offset:16
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v3 offset:7
-; ALIGNED-NEXT: flat_store_byte v[5:6], v4 offset:8
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:7
+; ALIGNED-NEXT: flat_store_byte v[5:6], v3 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:10
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:6
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:8
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:8
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:2
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:1
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:1
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0 offset:4
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0
+; ALIGNED-NEXT: flat_store_byte v[126:127], v0
; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1
; ALIGNED-NEXT: .LBB9_2: ; %Flow10
; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6
@@ -14326,30 +14332,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:22
; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:23
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v127, v6, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:28
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:29
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:30
-; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:31
; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:32
; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:33
-; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:27
; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:35
; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:36
-; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:38
-; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:39
; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:43
-; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:47
; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:48
; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:49
; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:50
@@ -14360,45 +14365,43 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:55
; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:56
; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:59
; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:60
; ALIGNED-NEXT: buffer_load_ubyte v50, v6, s[0:3], 0 offen offset:61
; ALIGNED-NEXT: buffer_load_ubyte v48, v6, s[0:3], 0 offen offset:62
; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:63
-; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:64
-; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:65
-; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:65
; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:66
; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:67
-; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:68
; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:69
; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:70
; ALIGNED-NEXT: buffer_load_ubyte v68, v6, s[0:3], 0 offen offset:71
; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:76
-; ALIGNED-NEXT: buffer_load_ubyte v81, v6, s[0:3], 0 offen offset:75
; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT: buffer_load_ubyte v81, v6, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:73
; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:78
; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v123, v6, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v122, v6, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v111, v6, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: buffer_load_ubyte v108, v6, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v105, v6, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: s_clause 0x34
-; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v106, v6, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v6, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v104, v6, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v91, v6, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v78, v6, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v77, v6, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v74, v6, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v63, v6, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v62, v6, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v125, v6, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v123, v6, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v111, v6, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v124, v6, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: s_clause 0x30
+; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v108, v6, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v90, v6, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v79, v6, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v77, v6, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v78, v6, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v72, v6, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v63, v6, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v62, v6, s[0:3], 0 offen offset:171
; ALIGNED-NEXT: buffer_load_ubyte v61, v6, s[0:3], 0 offen offset:176
; ALIGNED-NEXT: buffer_load_ubyte v59, v6, s[0:3], 0 offen offset:177
; ALIGNED-NEXT: buffer_load_ubyte v47, v6, s[0:3], 0 offen offset:178
@@ -14411,63 +14414,62 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v43, v6, s[0:3], 0 offen offset:189
; ALIGNED-NEXT: buffer_load_ubyte v42, v6, s[0:3], 0 offen offset:190
; ALIGNED-NEXT: buffer_load_ubyte v41, v6, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v6, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v119, v6, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v118, v6, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v116, v6, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v40, v6, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v119, v6, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v118, v6, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v116, v6, s[0:3], 0 offen offset:187
; ALIGNED-NEXT: buffer_load_ubyte v115, v6, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v112, v6, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v113, v6, s[0:3], 0 offen offset:193
; ALIGNED-NEXT: buffer_load_ubyte v101, v6, s[0:3], 0 offen offset:194
; ALIGNED-NEXT: buffer_load_ubyte v100, v6, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v113, v6, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v112, v6, s[0:3], 0 offen offset:196
; ALIGNED-NEXT: buffer_load_ubyte v103, v6, s[0:3], 0 offen offset:197
; ALIGNED-NEXT: buffer_load_ubyte v102, v6, s[0:3], 0 offen offset:198
; ALIGNED-NEXT: buffer_load_ubyte v99, v6, s[0:3], 0 offen offset:199
; ALIGNED-NEXT: buffer_load_ubyte v97, v6, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v87, v6, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v6, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v86, v6, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v6, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v84, v6, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v83, v6, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v124, v6, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_load_ubyte v96, v6, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v87, v6, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v85, v6, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v105, v6, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v107, v6, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_load_ubyte v106, v6, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:7
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v10, 8, v9
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v5
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v11
; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19
-; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v21, 8, v19
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v17, 8, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v27, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v29, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v26, 8, v25
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v4
; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7
@@ -14489,72 +14491,81 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v5, v51, 8, v38
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v52, 8, v16
; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:87
; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v5
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v55
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:86
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:84
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:74
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(60)
; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:212
; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:213
; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v82, 8, v81
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v86, v6, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v84, v6, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v83, v6, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v68, v6, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:223
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v50, v6, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v48, v6, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:211
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
@@ -14572,31 +14583,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:229
; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:230
; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:239
; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:235
; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0xc
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0xb
; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:240
; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:241
; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:242
@@ -14606,59 +14617,53 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:246
; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:247
; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v120
-; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v6, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(59)
+; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v75, 8, v93
+; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(56)
-; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(55)
-; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(54)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(52)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(50)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(49)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(48)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v2
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:83
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:81
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:80
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:94
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:98
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
@@ -14666,33 +14671,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:102
; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:103
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:101
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:91
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:89
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
@@ -14700,7 +14707,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14708,23 +14715,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:99
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:100
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:97
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:96
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:110
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:114
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
@@ -14732,33 +14740,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:118
; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:119
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:117
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:107
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:105
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
@@ -14766,7 +14776,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14774,23 +14784,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:115
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:116
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:113
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:112
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:126
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:130
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
@@ -14798,33 +14809,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:134
; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:135
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:133
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:123
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:121
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
@@ -14832,7 +14845,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14840,23 +14853,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:132
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:129
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:128
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:142
; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:146
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
@@ -14864,33 +14878,35 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:150
; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:151
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:149
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:139
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:137
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
@@ -14898,7 +14914,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -14906,174 +14922,180 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:148
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:145
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:144
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:158
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v12
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v9, 8, v11
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3
+; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:159
; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v18
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:156
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v100, 8, v101
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v127, v6, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v82, 8, v83
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
+; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 8, v127
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v84, 8, v86
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v94
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v111
+; ALIGNED-NEXT: v_lshl_or_b32 v74, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v80
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125
+; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v69, 8, v71
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v104
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v108, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v55
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v106
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v124
+; ALIGNED-NEXT: v_lshl_or_b32 v117, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v66, 8, v68
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v64
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v79
+; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v39, 8, v50
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v88, 8, v90
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v48
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v62, 8, v63
+; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v78
+; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
+; ALIGNED-NEXT: v_lshl_or_b32 v53, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v28, 8, v29
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v27
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
+; ALIGNED-NEXT: v_lshl_or_b32 v51, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v24, 8, v26
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v23
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
+; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v19, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v15
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
+; ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v20
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v116, 8, v118
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v40
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v12, 8, v13
; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v100, 8, v101
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115
-; ALIGNED-NEXT: v_lshl_or_b32 v107, v2, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v99, 8, v102
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v2, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v115
+; ALIGNED-NEXT: v_lshl_or_b32 v122, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v112
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v99, 8, v102
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v96, 8, v97
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v85, 8, v87
+; ALIGNED-NEXT: v_lshl_or_b32 v91, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:251
; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen
-; ALIGNED-NEXT: v_lshl_or_b32 v79, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82
-; ALIGNED-NEXT: v_lshl_or_b32 v72, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v80
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v70
-; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v66, 8, v71
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53
-; ALIGNED-NEXT: v_lshl_or_b32 v117, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
-; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v49
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v39
-; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31
-; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34
-; ALIGNED-NEXT: v_lshl_or_b32 v68, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28
-; ALIGNED-NEXT: v_lshl_or_b32 v50, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v23, 8, v24
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v19, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v15
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v20
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v11, 8, v13
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 8, v8
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 8, v10
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v9, 8, v1
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v88, 16, v5
-; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v1, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v73, 16, v5
+; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:1
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v88, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v109, 8, v93
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v121, 8, v110
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:10
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v125, v6, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v126, 8, v89
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v125, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v110, 8, v121
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v120, 8, v109
+; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 16, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v94, 8, v92
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v104, v6, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v89, 8, v76
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 16, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v105, 8, v110
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v106, 8, v107
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:18
-; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:16
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:17
-; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0xffffff00, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 16, v73
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshl_or_b32 v73, v94, 8, v104
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v92, 8, v93
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 16, v73
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0xffffff00, v6
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492
; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484
@@ -15082,24 +15104,24 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704
; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v88
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v75
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v89, 8, v93
+; ALIGNED-NEXT: v_lshl_or_b32 v126, v73, 8, v89
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v3, vcc_lo
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126
; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, 3
; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v3, vcc_lo
-; ALIGNED-NEXT: flat_store_byte v[4:5], v1 offset:247
-; ALIGNED-NEXT: flat_store_byte v[4:5], v9 offset:248
-; ALIGNED-NEXT: flat_store_byte v[4:5], v7 offset:246
-; ALIGNED-NEXT: flat_store_byte v[4:5], v10 offset:252
-; ALIGNED-NEXT: flat_store_byte v[4:5], v11 offset:250
-; ALIGNED-NEXT: flat_store_byte v[4:5], v12 offset:251
+; ALIGNED-NEXT: flat_store_byte v[4:5], v7 offset:247
+; ALIGNED-NEXT: flat_store_byte v[4:5], v1 offset:248
+; ALIGNED-NEXT: flat_store_byte v[4:5], v8 offset:246
+; ALIGNED-NEXT: flat_store_byte v[4:5], v9 offset:252
+; ALIGNED-NEXT: flat_store_byte v[4:5], v12 offset:250
+; ALIGNED-NEXT: flat_store_byte v[4:5], v11 offset:251
; ALIGNED-NEXT: flat_store_byte v[4:5], v13 offset:249
-; ALIGNED-NEXT: flat_store_byte v[4:5], v8 offset:245
+; ALIGNED-NEXT: flat_store_byte v[4:5], v10 offset:245
; ALIGNED-NEXT: flat_store_byte v[4:5], v15 offset:239
; ALIGNED-NEXT: flat_store_byte v[4:5], v16 offset:240
; ALIGNED-NEXT: flat_store_byte v[4:5], v19 offset:238
@@ -15108,18 +15130,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: flat_store_byte v[4:5], v18 offset:243
; ALIGNED-NEXT: flat_store_byte v[4:5], v20 offset:241
; ALIGNED-NEXT: flat_store_byte v[4:5], v21 offset:237
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:508
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500
; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496
-; ALIGNED-NEXT: flat_store_byte v[4:5], v22 offset:231
-; ALIGNED-NEXT: flat_store_byte v[4:5], v25 offset:232
-; ALIGNED-NEXT: flat_store_byte v[4:5], v23 offset:230
-; ALIGNED-NEXT: flat_store_byte v[4:5], v26 offset:236
-; ALIGNED-NEXT: flat_store_byte v[4:5], v27 offset:234
-; ALIGNED-NEXT: flat_store_byte v[4:5], v28 offset:235
+; ALIGNED-NEXT: flat_store_byte v[4:5], v23 offset:231
+; ALIGNED-NEXT: flat_store_byte v[4:5], v22 offset:232
+; ALIGNED-NEXT: flat_store_byte v[4:5], v24 offset:230
+; ALIGNED-NEXT: flat_store_byte v[4:5], v25 offset:236
+; ALIGNED-NEXT: flat_store_byte v[4:5], v28 offset:234
+; ALIGNED-NEXT: flat_store_byte v[4:5], v27 offset:235
; ALIGNED-NEXT: flat_store_byte v[4:5], v29 offset:233
-; ALIGNED-NEXT: flat_store_byte v[4:5], v24 offset:229
+; ALIGNED-NEXT: flat_store_byte v[4:5], v26 offset:229
; ALIGNED-NEXT: flat_store_byte v[4:5], v31 offset:223
; ALIGNED-NEXT: flat_store_byte v[4:5], v32 offset:224
; ALIGNED-NEXT: flat_store_byte v[4:5], v35 offset:222
@@ -15133,41 +15155,41 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:456
; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452
; ALIGNED-NEXT: flat_store_byte v[4:5], v67 offset:210
-; ALIGNED-NEXT: flat_store_byte v[4:5], v64 offset:212
-; ALIGNED-NEXT: flat_store_byte v[4:5], v38 offset:206
-; ALIGNED-NEXT: flat_store_byte v[4:5], v65 offset:208
-; ALIGNED-NEXT: flat_store_byte v[4:5], v39 offset:207
+; ALIGNED-NEXT: flat_store_byte v[4:5], v65 offset:212
+; ALIGNED-NEXT: flat_store_byte v[4:5], v39 offset:206
+; ALIGNED-NEXT: flat_store_byte v[4:5], v38 offset:208
+; ALIGNED-NEXT: flat_store_byte v[4:5], v48 offset:207
; ALIGNED-NEXT: flat_store_byte v[4:5], v70 offset:211
; ALIGNED-NEXT: flat_store_byte v[4:5], v80 offset:209
-; ALIGNED-NEXT: flat_store_byte v[4:5], v53 offset:215
-; ALIGNED-NEXT: flat_store_byte v[4:5], v52 offset:216
-; ALIGNED-NEXT: flat_store_byte v[4:5], v66 offset:214
-; ALIGNED-NEXT: flat_store_byte v[4:5], v51 offset:220
-; ALIGNED-NEXT: flat_store_byte v[4:5], v55 offset:218
-; ALIGNED-NEXT: flat_store_byte v[4:5], v54 offset:219
-; ALIGNED-NEXT: flat_store_byte v[4:5], v69 offset:217
+; ALIGNED-NEXT: flat_store_byte v[4:5], v55 offset:215
+; ALIGNED-NEXT: flat_store_byte v[4:5], v54 offset:216
+; ALIGNED-NEXT: flat_store_byte v[4:5], v69 offset:214
+; ALIGNED-NEXT: flat_store_byte v[4:5], v52 offset:220
+; ALIGNED-NEXT: flat_store_byte v[4:5], v66 offset:218
+; ALIGNED-NEXT: flat_store_byte v[4:5], v64 offset:219
+; ALIGNED-NEXT: flat_store_byte v[4:5], v68 offset:217
; ALIGNED-NEXT: flat_store_byte v[4:5], v71 offset:213
-; ALIGNED-NEXT: flat_store_byte v[4:5], v49 offset:205
-; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:472
-; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:476
-; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468
-; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: flat_store_byte v[4:5], v50 offset:205
+; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:464
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v82 offset:199
-; ALIGNED-NEXT: flat_store_byte v[4:5], v85 offset:200
-; ALIGNED-NEXT: flat_store_byte v[4:5], v83 offset:198
-; ALIGNED-NEXT: flat_store_byte v[4:5], v86 offset:204
-; ALIGNED-NEXT: flat_store_byte v[4:5], v87 offset:202
-; ALIGNED-NEXT: flat_store_byte v[4:5], v96 offset:203
+; ALIGNED-NEXT: flat_store_byte v[4:5], v83 offset:199
+; ALIGNED-NEXT: flat_store_byte v[4:5], v82 offset:200
+; ALIGNED-NEXT: flat_store_byte v[4:5], v84 offset:198
+; ALIGNED-NEXT: flat_store_byte v[4:5], v85 offset:204
+; ALIGNED-NEXT: flat_store_byte v[4:5], v96 offset:202
+; ALIGNED-NEXT: flat_store_byte v[4:5], v87 offset:203
; ALIGNED-NEXT: flat_store_byte v[4:5], v97 offset:201
-; ALIGNED-NEXT: flat_store_byte v[4:5], v84 offset:197
+; ALIGNED-NEXT: flat_store_byte v[4:5], v86 offset:197
; ALIGNED-NEXT: flat_store_byte v[4:5], v101 offset:191
; ALIGNED-NEXT: flat_store_byte v[4:5], v100 offset:192
-; ALIGNED-NEXT: flat_store_byte v[4:5], v112 offset:190
+; ALIGNED-NEXT: flat_store_byte v[4:5], v113 offset:190
; ALIGNED-NEXT: flat_store_byte v[4:5], v99 offset:196
; ALIGNED-NEXT: flat_store_byte v[4:5], v103 offset:194
; ALIGNED-NEXT: flat_store_byte v[4:5], v102 offset:195
-; ALIGNED-NEXT: flat_store_byte v[4:5], v113 offset:193
+; ALIGNED-NEXT: flat_store_byte v[4:5], v112 offset:193
; ALIGNED-NEXT: flat_store_byte v[4:5], v115 offset:189
; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
@@ -15184,14 +15206,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v116 offset:183
-; ALIGNED-NEXT: flat_store_byte v[4:5], v40 offset:184
-; ALIGNED-NEXT: flat_store_byte v[4:5], v118 offset:182
+; ALIGNED-NEXT: flat_store_byte v[4:5], v118 offset:183
+; ALIGNED-NEXT: flat_store_byte v[4:5], v116 offset:184
+; ALIGNED-NEXT: flat_store_byte v[4:5], v119 offset:182
; ALIGNED-NEXT: flat_store_byte v[4:5], v41 offset:188
; ALIGNED-NEXT: flat_store_byte v[4:5], v43 offset:186
; ALIGNED-NEXT: flat_store_byte v[4:5], v42 offset:187
; ALIGNED-NEXT: flat_store_byte v[4:5], v44 offset:185
-; ALIGNED-NEXT: flat_store_byte v[4:5], v119 offset:181
+; ALIGNED-NEXT: flat_store_byte v[4:5], v40 offset:181
; ALIGNED-NEXT: flat_store_byte v[4:5], v47 offset:175
; ALIGNED-NEXT: flat_store_byte v[4:5], v56 offset:176
; ALIGNED-NEXT: flat_store_byte v[4:5], v59 offset:174
@@ -15211,69 +15233,75 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v62 offset:167
-; ALIGNED-NEXT: flat_store_byte v[4:5], v74 offset:168
-; ALIGNED-NEXT: flat_store_byte v[4:5], v63 offset:166
-; ALIGNED-NEXT: flat_store_byte v[4:5], v75 offset:172
-; ALIGNED-NEXT: flat_store_byte v[4:5], v77 offset:170
-; ALIGNED-NEXT: flat_store_byte v[4:5], v76 offset:171
-; ALIGNED-NEXT: flat_store_byte v[4:5], v78 offset:169
-; ALIGNED-NEXT: flat_store_byte v[4:5], v73 offset:165
-; ALIGNED-NEXT: flat_store_byte v[4:5], v94 offset:159
-; ALIGNED-NEXT: flat_store_byte v[4:5], v92 offset:160
-; ALIGNED-NEXT: flat_store_byte v[4:5], v105 offset:158
-; ALIGNED-NEXT: flat_store_byte v[4:5], v91 offset:164
-; ALIGNED-NEXT: flat_store_byte v[4:5], v95 offset:162
-; ALIGNED-NEXT: flat_store_byte v[4:5], v104 offset:163
-; ALIGNED-NEXT: flat_store_byte v[4:5], v106 offset:161
-; ALIGNED-NEXT: flat_store_byte v[4:5], v108 offset:157
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[4:5], v63 offset:167
+; ALIGNED-NEXT: flat_store_byte v[4:5], v62 offset:168
+; ALIGNED-NEXT: flat_store_byte v[4:5], v72 offset:166
+; ALIGNED-NEXT: flat_store_byte v[4:5], v77 offset:172
+; ALIGNED-NEXT: flat_store_byte v[4:5], v88 offset:170
+; ALIGNED-NEXT: flat_store_byte v[4:5], v79 offset:171
+; ALIGNED-NEXT: flat_store_byte v[4:5], v90 offset:169
+; ALIGNED-NEXT: flat_store_byte v[4:5], v78 offset:165
+; ALIGNED-NEXT: flat_store_byte v[4:5], v111 offset:159
+; ALIGNED-NEXT: flat_store_byte v[4:5], v109 offset:160
+; ALIGNED-NEXT: flat_store_byte v[4:5], v123 offset:158
+; ALIGNED-NEXT: flat_store_byte v[4:5], v108 offset:164
+; ALIGNED-NEXT: flat_store_byte v[4:5], v121 offset:162
+; ALIGNED-NEXT: flat_store_byte v[4:5], v120 offset:163
+; ALIGNED-NEXT: flat_store_byte v[4:5], v124 offset:161
+; ALIGNED-NEXT: flat_store_byte v[4:5], v125 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v111 offset:151
-; ALIGNED-NEXT: flat_store_byte v[4:5], v124 offset:152
-; ALIGNED-NEXT: flat_store_byte v[4:5], v122 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[4:5], v127 offset:151
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:152
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:156
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:154
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:155
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:153
-; ALIGNED-NEXT: flat_store_byte v[4:5], v123 offset:149
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:149
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:143
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:144
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:142
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:148
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:146
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:147
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:145
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
@@ -15522,13 +15550,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
@@ -15537,10 +15565,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:72
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:70
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:76
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
@@ -15708,7 +15736,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:640
+; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:640
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:23
@@ -15733,11 +15761,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:21
-; ALIGNED-NEXT: flat_store_byte v[4:5], v88 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:16
-; ALIGNED-NEXT: flat_store_byte v[4:5], v89 offset:14
+; ALIGNED-NEXT: flat_store_byte v[4:5], v75 offset:15
+; ALIGNED-NEXT: flat_store_byte v[4:5], v76 offset:16
+; ALIGNED-NEXT: flat_store_byte v[4:5], v73 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:20
@@ -15750,53 +15776,49 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:17
-; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:16
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:16
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v109 offset:7
-; ALIGNED-NEXT: flat_store_byte v[4:5], v120 offset:8
-; ALIGNED-NEXT: flat_store_byte v[4:5], v125 offset:10
-; ALIGNED-NEXT: flat_store_byte v[4:5], v110 offset:6
-; ALIGNED-NEXT: flat_store_byte v[4:5], v126 offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:9
-; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:8
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[4:5], v93 offset:7
+; ALIGNED-NEXT: flat_store_byte v[4:5], v92 offset:8
+; ALIGNED-NEXT: flat_store_byte v[4:5], v105 offset:10
+; ALIGNED-NEXT: flat_store_byte v[4:5], v94 offset:6
+; ALIGNED-NEXT: flat_store_byte v[4:5], v106 offset:12
+; ALIGNED-NEXT: flat_store_byte v[4:5], v107 offset:11
+; ALIGNED-NEXT: flat_store_byte v[4:5], v110 offset:9
+; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:3
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte v[2:3], v0
; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index 01b7f40f6256f..a8b3ffbb21ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -142,8 +142,8 @@ define void @memmove_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
@@ -198,18 +198,18 @@ define void @memmove_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:20
; CHECK-NEXT: flat_load_dword v2, v[2:3] offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
@@ -371,8 +371,8 @@ define void @memmove_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
@@ -427,18 +427,18 @@ define void @memmove_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:20
; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
@@ -820,8 +820,8 @@ define void @memmove_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
@@ -876,18 +876,18 @@ define void @memmove_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:20
; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
@@ -1320,8 +1320,8 @@ define void @memmove_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
@@ -1373,18 +1373,18 @@ define void @memmove_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:20
; CHECK-NEXT: flat_load_dword v2, v[2:3] offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v9, off offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
@@ -1537,8 +1537,8 @@ define void @memmove_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
@@ -1590,18 +1590,18 @@ define void @memmove_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:20
; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v9, off offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
@@ -1755,16 +1755,17 @@ define void @memmove_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1
; CHECK-NEXT: ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:30
; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
-; CHECK-NEXT: ds_read_u16 v2, v2 offset:28
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_byte v[0:1], v2, off offset:30
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -1806,16 +1807,17 @@ define void @memmove_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b128 v[3:6], v2
; CHECK-NEXT: ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:30
; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
-; CHECK-NEXT: ds_read_u16 v2, v2 offset:28
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_byte v[0:1], v2, off offset:30
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -1964,8 +1966,8 @@ define void @memmove_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-NEXT: s_clause 0x4
; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
@@ -2017,18 +2019,18 @@ define void @memmove_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:20
; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v9, off offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
@@ -2513,18 +2515,18 @@ define void @memmove_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: ds_write_b32 v0, v8 offset:16
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b32 v0, v8 offset:20
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:20
; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:24
-; CHECK-NEXT: ds_write_b8 v0, v7 offset:30
-; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT: ds_write_b8 v0, v9 offset:30
+; CHECK-NEXT: ds_write_b16 v0, v8 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; CHECK-NEXT: ds_write_b32 v0, v1 offset:24
; CHECK-NEXT: ds_write_b128 v0, v[3:6]
@@ -3751,18 +3753,18 @@ define void @memmove_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:24
-; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index 9585c486aeb9e..fd86113a3538d 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -300,14 +300,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
; GFX908-NEXT: ; def a0
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
@@ -406,14 +406,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
; GFX908-NEXT: ; use a[100:131]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
@@ -512,14 +512,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
; GFX908-NEXT: ; def v0
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
@@ -640,14 +640,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mov_b32_e32 v40, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112
; GFX908-NEXT: global_load_dwordx4 v[24:27], v40, s[34:35] offset:96
-; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112
; GFX908-NEXT: global_load_dwordx4 v[16:19], v40, s[34:35] offset:64
-; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80
; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32
-; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48
; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35]
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
@@ -925,14 +925,14 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
-; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
-; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
@@ -1026,14 +1026,14 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
-; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
-; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index 30ad3be46053c..829895d6784a4 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -12,28 +12,29 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
; GFX9-NEXT: v_ffbh_u32_e32 v2, v2
@@ -49,33 +50,36 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
-; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:5
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:7
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3]
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:2
+; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_ffbh_u32_e32 v2, v3
+; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
-; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
+; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
+; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
@@ -91,28 +95,29 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
; GFX9-NEXT: v_ffbh_u32_e32 v2, v2
@@ -129,33 +134,36 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
-; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3]
-; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2
-; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:5
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:7
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3]
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:2
+; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_ffbh_u32_e32 v2, v3
+; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
-; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
+; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
+; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
@@ -172,28 +180,29 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
; GFX9-NEXT: v_ffbl_b32_e32 v0, v0
@@ -211,27 +220,28 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v5
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
@@ -253,28 +263,29 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
-; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
; GFX9-NEXT: v_ffbl_b32_e32 v0, v0
@@ -293,27 +304,28 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
-; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v5
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll
index db82530f66aa4..abe729d41f86c 100644
--- a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll
@@ -48,19 +48,23 @@ define <3 x i64> @v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) {
; CHECK-LABEL: v3_ashr_metadata:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: v_add_co_u32_e32 v11, vcc, 20, v0
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v1, vcc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_load_dword v4, v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
-; CHECK-NEXT: v_mov_b32_e32 v3, -1
-; CHECK-NEXT: flat_load_dword v1, v[0:1]
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; CHECK-NEXT: flat_load_dword v5, v[11:12]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5
-; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7
-; CHECK-NEXT: v_ashrrev_i32_e32 v4, v4, v1
+; CHECK-NEXT: flat_load_dword v7, v[2:3] offset:16
+; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3
+; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; CHECK-NEXT: v_mov_b32_e32 v1, -1
+; CHECK-NEXT: v_mov_b32_e32 v3, -1
+; CHECK-NEXT: ; kill: killed $vgpr11 killed $vgpr12
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v8
+; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v10
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_ashrrev_i32_e32 v4, v7, v5
; CHECK-NEXT: v_mov_b32_e32 v5, -1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%val = load <3 x i64>, ptr %arg0.ptr, !range !4, !noundef !{}
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 1ed024f7aed36..428a300c713b5 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -188,64 +188,61 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900-LABEL: fadd_v32_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
-; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
-; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
-; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
-; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
-; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
-; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
-; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
-; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
-; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
-; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
-; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
-; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
-; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
-; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
-; GFX900-NEXT: s_waitcnt vmcnt(3)
-; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
-; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
-; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
-; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
-; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
-; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
-; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
-; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
-; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
-; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
-; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_add_f32_e32 v3, s43, v3
+; GFX900-NEXT: v_add_f32_e32 v2, s42, v2
+; GFX900-NEXT: v_add_f32_e32 v1, s41, v1
+; GFX900-NEXT: v_add_f32_e32 v0, s40, v0
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_add_f32_e32 v7, s39, v7
+; GFX900-NEXT: v_add_f32_e32 v6, s38, v6
+; GFX900-NEXT: v_add_f32_e32 v5, s37, v5
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
-; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
-; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
-; GFX900-NEXT: v_add_f32_e32 v21, s8, v21
-; GFX900-NEXT: v_add_f32_e32 v28, s23, v28
-; GFX900-NEXT: v_add_f32_e32 v27, s22, v27
-; GFX900-NEXT: v_add_f32_e32 v26, s21, v26
-; GFX900-NEXT: v_add_f32_e32 v25, s20, v25
-; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
-; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
-; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
-; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
-; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
-; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
-; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
-; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: v_add_f32_e32 v31, s19, v31
+; GFX900-NEXT: v_add_f32_e32 v30, s18, v30
+; GFX900-NEXT: v_add_f32_e32 v29, s17, v29
+; GFX900-NEXT: v_add_f32_e32 v28, s16, v28
+; GFX900-NEXT: v_add_f32_e32 v4, s36, v4
+; GFX900-NEXT: v_add_f32_e32 v11, s51, v11
+; GFX900-NEXT: v_add_f32_e32 v10, s50, v10
+; GFX900-NEXT: v_add_f32_e32 v9, s49, v9
+; GFX900-NEXT: v_add_f32_e32 v8, s48, v8
+; GFX900-NEXT: v_add_f32_e32 v15, s47, v15
+; GFX900-NEXT: v_add_f32_e32 v14, s46, v14
+; GFX900-NEXT: v_add_f32_e32 v13, s45, v13
+; GFX900-NEXT: v_add_f32_e32 v12, s44, v12
+; GFX900-NEXT: v_add_f32_e32 v19, s15, v19
+; GFX900-NEXT: v_add_f32_e32 v18, s14, v18
+; GFX900-NEXT: v_add_f32_e32 v17, s13, v17
+; GFX900-NEXT: v_add_f32_e32 v16, s12, v16
+; GFX900-NEXT: v_add_f32_e32 v23, s11, v23
+; GFX900-NEXT: v_add_f32_e32 v22, s10, v22
+; GFX900-NEXT: v_add_f32_e32 v21, s9, v21
+; GFX900-NEXT: v_add_f32_e32 v20, s8, v20
+; GFX900-NEXT: v_add_f32_e32 v27, s23, v27
+; GFX900-NEXT: v_add_f32_e32 v26, s22, v26
+; GFX900-NEXT: v_add_f32_e32 v25, s21, v25
+; GFX900-NEXT: v_add_f32_e32 v24, s20, v24
+; GFX900-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_v32_vs:
@@ -1475,64 +1472,61 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900-LABEL: fmul_v32_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
-; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
-; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
-; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
-; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
-; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
-; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
-; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
-; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
-; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
-; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
-; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
-; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
-; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
-; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
-; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
-; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
-; GFX900-NEXT: s_waitcnt vmcnt(3)
-; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
-; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
-; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
-; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
-; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
-; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
-; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
-; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
-; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
-; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
-; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_mul_f32_e32 v3, s43, v3
+; GFX900-NEXT: v_mul_f32_e32 v2, s42, v2
+; GFX900-NEXT: v_mul_f32_e32 v1, s41, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, s40, v0
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_mul_f32_e32 v7, s39, v7
+; GFX900-NEXT: v_mul_f32_e32 v6, s38, v6
+; GFX900-NEXT: v_mul_f32_e32 v5, s37, v5
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
-; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
-; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
-; GFX900-NEXT: v_mul_f32_e32 v21, s8, v21
-; GFX900-NEXT: v_mul_f32_e32 v28, s23, v28
-; GFX900-NEXT: v_mul_f32_e32 v27, s22, v27
-; GFX900-NEXT: v_mul_f32_e32 v26, s21, v26
-; GFX900-NEXT: v_mul_f32_e32 v25, s20, v25
-; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
-; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
-; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
-; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
-; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
-; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
-; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
-; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: v_mul_f32_e32 v31, s19, v31
+; GFX900-NEXT: v_mul_f32_e32 v30, s18, v30
+; GFX900-NEXT: v_mul_f32_e32 v29, s17, v29
+; GFX900-NEXT: v_mul_f32_e32 v28, s16, v28
+; GFX900-NEXT: v_mul_f32_e32 v4, s36, v4
+; GFX900-NEXT: v_mul_f32_e32 v11, s51, v11
+; GFX900-NEXT: v_mul_f32_e32 v10, s50, v10
+; GFX900-NEXT: v_mul_f32_e32 v9, s49, v9
+; GFX900-NEXT: v_mul_f32_e32 v8, s48, v8
+; GFX900-NEXT: v_mul_f32_e32 v15, s47, v15
+; GFX900-NEXT: v_mul_f32_e32 v14, s46, v14
+; GFX900-NEXT: v_mul_f32_e32 v13, s45, v13
+; GFX900-NEXT: v_mul_f32_e32 v12, s44, v12
+; GFX900-NEXT: v_mul_f32_e32 v19, s15, v19
+; GFX900-NEXT: v_mul_f32_e32 v18, s14, v18
+; GFX900-NEXT: v_mul_f32_e32 v17, s13, v17
+; GFX900-NEXT: v_mul_f32_e32 v16, s12, v16
+; GFX900-NEXT: v_mul_f32_e32 v23, s11, v23
+; GFX900-NEXT: v_mul_f32_e32 v22, s10, v22
+; GFX900-NEXT: v_mul_f32_e32 v21, s9, v21
+; GFX900-NEXT: v_mul_f32_e32 v20, s8, v20
+; GFX900-NEXT: v_mul_f32_e32 v27, s23, v27
+; GFX900-NEXT: v_mul_f32_e32 v26, s22, v26
+; GFX900-NEXT: v_mul_f32_e32 v25, s21, v25
+; GFX900-NEXT: v_mul_f32_e32 v24, s20, v24
+; GFX900-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fmul_v32_vs:
@@ -2323,64 +2317,61 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX900-LABEL: fma_v32_vs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
-; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
-; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
-; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
-; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
-; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
-; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
-; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
-; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
-; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
-; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
-; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
-; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
-; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
-; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
-; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
-; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
-; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
-; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
-; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
-; GFX900-NEXT: s_waitcnt vmcnt(3)
-; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
-; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
-; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
-; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
-; GFX900-NEXT: s_waitcnt vmcnt(2)
-; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
-; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
-; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
-; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
-; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
-; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
-; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
-; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_fma_f32 v3, v3, s43, s43
+; GFX900-NEXT: v_fma_f32 v2, v2, s42, s42
+; GFX900-NEXT: v_fma_f32 v1, v1, s41, s41
+; GFX900-NEXT: v_fma_f32 v0, v0, s40, s40
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_fma_f32 v7, v7, s39, s39
+; GFX900-NEXT: v_fma_f32 v6, v6, s38, s38
+; GFX900-NEXT: v_fma_f32 v5, v5, s37, s37
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
-; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
-; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
-; GFX900-NEXT: v_fma_f32 v21, v21, s8, s8
-; GFX900-NEXT: v_fma_f32 v28, v28, s23, s23
-; GFX900-NEXT: v_fma_f32 v27, v27, s22, s22
-; GFX900-NEXT: v_fma_f32 v26, v26, s21, s21
-; GFX900-NEXT: v_fma_f32 v25, v25, s20, s20
-; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
-; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
-; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
-; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
-; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
-; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
-; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
-; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: v_fma_f32 v31, v31, s19, s19
+; GFX900-NEXT: v_fma_f32 v30, v30, s18, s18
+; GFX900-NEXT: v_fma_f32 v29, v29, s17, s17
+; GFX900-NEXT: v_fma_f32 v28, v28, s16, s16
+; GFX900-NEXT: v_fma_f32 v4, v4, s36, s36
+; GFX900-NEXT: v_fma_f32 v11, v11, s51, s51
+; GFX900-NEXT: v_fma_f32 v10, v10, s50, s50
+; GFX900-NEXT: v_fma_f32 v9, v9, s49, s49
+; GFX900-NEXT: v_fma_f32 v8, v8, s48, s48
+; GFX900-NEXT: v_fma_f32 v15, v15, s47, s47
+; GFX900-NEXT: v_fma_f32 v14, v14, s46, s46
+; GFX900-NEXT: v_fma_f32 v13, v13, s45, s45
+; GFX900-NEXT: v_fma_f32 v12, v12, s44, s44
+; GFX900-NEXT: v_fma_f32 v19, v19, s15, s15
+; GFX900-NEXT: v_fma_f32 v18, v18, s14, s14
+; GFX900-NEXT: v_fma_f32 v17, v17, s13, s13
+; GFX900-NEXT: v_fma_f32 v16, v16, s12, s12
+; GFX900-NEXT: v_fma_f32 v23, v23, s11, s11
+; GFX900-NEXT: v_fma_f32 v22, v22, s10, s10
+; GFX900-NEXT: v_fma_f32 v21, v21, s9, s9
+; GFX900-NEXT: v_fma_f32 v20, v20, s8, s8
+; GFX900-NEXT: v_fma_f32 v27, v27, s23, s23
+; GFX900-NEXT: v_fma_f32 v26, v26, s22, s22
+; GFX900-NEXT: v_fma_f32 v25, v25, s21, s21
+; GFX900-NEXT: v_fma_f32 v24, v24, s20, s20
+; GFX900-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fma_v32_vs:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 0741cb256cc24..010f2d4808504 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3641,14 +3641,14 @@ define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
-; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6
-; GFX10-NEXT: global_load_ushort v3, v[0:1], off
-; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:2
-; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshl_or_b32 v0, v8, 16, v3
+; GFX10-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-NEXT: global_load_ushort v3, v[0:1], off offset:2
+; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:4
+; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:6
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v9
+; GFX10-NEXT: v_lshl_or_b32 v1, v9, 16, v8
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3787,14 +3787,14 @@ define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
-; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:64
+; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_perm_b32 v0, v12, v13, 0x1000504
+; GFX10-NEXT: v_perm_b32 v0, v10, v11, 0x1000504
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v1, v10, v14, 0x1000504
+; GFX10-NEXT: v_perm_b32 v1, v14, v8, 0x1000504
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3802,15 +3802,15 @@ define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-LABEL: extract_v13i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:64
+; GFX9-NEXT: global_load_dwordx4 v[9:12], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
; GFX9-NEXT: s_mov_b32 s4, 0x1000504
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_perm_b32 v0, v12, v13, s4
+; GFX9-NEXT: v_perm_b32 v0, v10, v11, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v1, v10, v14, s4
+; GFX9-NEXT: v_perm_b32 v1, v14, v8, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: global_store_dword v[6:7], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir
index 69983faf2b154..a1076b1fb50fb 100644
--- a/llvm/test/CodeGen/AMDGPU/pr51516.mir
+++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -117,6 +117,3 @@ body: |
S_ENDPGM 0
...
-## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-# GCN: {{.*}}
-# GCN-GCNTRACKER: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index b43454840ee16..9941dcab14950 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -158,8 +158,8 @@ define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v2
-; GCN-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3329c9a761900..e986c37bf75ba 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -500,14 +500,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: v_mov_b32_e32 v7, 0x7f
-; GFX900-NEXT: s_movk_i32 s2, 0xd000
-; GFX900-NEXT: s_movk_i32 s3, 0xe000
-; GFX900-NEXT: s_movk_i32 s4, 0xf000
+; GFX900-NEXT: s_movk_i32 s0, 0xd000
+; GFX900-NEXT: s_movk_i32 s1, 0xe000
+; GFX900-NEXT: s_movk_i32 s2, 0xf000
; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX900-NEXT: ; =>This Loop Header: Depth=1
; GFX900-NEXT: ; Child Loop BB1_2 Depth 2
; GFX900-NEXT: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: s_mov_b32 s5, 0
+; GFX900-NEXT: s_mov_b32 s3, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v0
; GFX900-NEXT: .LBB1_2: ; %for.body
; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1
@@ -519,39 +519,40 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
-; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v2
+; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s0, v2
; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v3, vcc
-; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s1, v2
; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048
-; GFX900-NEXT: s_addk_i32 s5, 0x2000
-; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff
+; GFX900-NEXT: s_addk_i32 s3, 0x2000
+; GFX900-NEXT: s_cmp_gt_u32 s3, 0x3fffff
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX900-NEXT: v_addc_co_u32_e32 v23, vcc, v9, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096
+; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v2
+; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(5)
-; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22
-; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1]
+; GFX900-NEXT: v_add_co_u32_e32 v26, vcc, v18, v22
+; GFX900-NEXT: v_addc_co_u32_e32 v27, vcc, v19, v23, vcc
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off
-; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc
-; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048
-; GFX900-NEXT: s_waitcnt vmcnt(7)
-; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24
-; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off
-; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc
+; GFX900-NEXT: global_load_dwordx2 v[24:25], v[4:5], off offset:-2048
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX900-NEXT: s_waitcnt vmcnt(8)
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v20, v26
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v21, v27, vcc
; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2
; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX900-NEXT: s_waitcnt vmcnt(7)
-; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20
-; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v16, v14
+; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v17, v15, vcc
; GFX900-NEXT: s_waitcnt vmcnt(4)
-; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v16
-; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v17, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v14
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v15, vcc
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v18, v8
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v19, v9, vcc
@@ -559,15 +560,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v22, v8
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v23, v9, vcc
; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v24, v8
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v25, v9, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v11, v9, vcc
+; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8
+; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v9, vcc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc
-; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
-; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
-; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
; GFX900-NEXT: s_cbranch_scc0 .LBB1_2
; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1
@@ -623,25 +624,25 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffb800
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xffffc800
+; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, 0xfffff000, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v4, 0xffffd800
-; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v18, vcc_lo, v4, 0xffffe800
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
-; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off offset:-2048
-; GFX10-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
-; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v23, vcc_lo, -1, v5, vcc_lo
-; GFX10-NEXT: s_clause 0x7
-; GFX10-NEXT: global_load_dwordx2 v[24:25], v[18:19], off offset:-2048
+; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v4, 0xffffc800
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off offset:-2048
+; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off
+; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, -1, v5, vcc_lo
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
-; GFX10-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
-; GFX10-NEXT: global_load_dwordx2 v[14:15], v[14:15], off
-; GFX10-NEXT: global_load_dwordx2 v[26:27], v[18:19], off
-; GFX10-NEXT: global_load_dwordx2 v[28:29], v[22:23], off
+; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xffffd800
+; GFX10-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:-2048
+; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v20, vcc_lo, v4, 0xffffe800
+; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, -1, v5, vcc_lo
+; GFX10-NEXT: s_clause 0x6
+; GFX10-NEXT: global_load_dwordx2 v[22:23], v[10:11], off offset:-2048
+; GFX10-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
+; GFX10-NEXT: global_load_dwordx2 v[24:25], v[10:11], off
+; GFX10-NEXT: global_load_dwordx2 v[26:27], v[20:21], off offset:-2048
+; GFX10-NEXT: global_load_dwordx2 v[28:29], v[20:21], off
; GFX10-NEXT: global_load_dwordx2 v[30:31], v[4:5], off offset:-2048
; GFX10-NEXT: global_load_dwordx2 v[32:33], v[4:5], off
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4
@@ -649,21 +650,20 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_addk_i32 s1, 0x2000
; GFX10-NEXT: s_cmp_gt_u32 s1, 0x3fffff
; GFX10-NEXT: s_waitcnt vmcnt(10)
-; GFX10-NEXT: v_add_co_u32 v2, s0, v12, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v13, v3, s0
-; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0
+; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: v_add_co_u32 v2, s0, v8, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v9, v3, s0
-; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v17, v3, s0
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_add_co_u32 v2, s0, v18, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v19, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_add_co_u32 v2, s0, v10, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v11, v3, s0
-; GFX10-NEXT: v_add_co_u32 v2, s0, v20, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v21, v3, s0
+; GFX10-NEXT: v_add_co_u32 v2, s0, v12, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v13, v3, s0
+; GFX10-NEXT: v_add_co_u32 v2, s0, v22, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v23, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v24, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v25, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(3)
@@ -672,6 +672,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_co_u32 v2, s0, v28, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v29, v3, s0
+; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v17, v3, s0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_add_co_u32 v2, s0, v30, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v31, v3, s0
@@ -732,34 +734,37 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: .LBB1_2: ; %for.body
; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
+; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, 0xffffb000, v6
+; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, -1, v7, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffc000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off
-; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6
-; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
-; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off
-; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6
-; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048
-; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s1, v6
+; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[10:11], off
+; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:-2048
+; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s0, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[14:15], off offset:-4096
-; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[14:15], off offset:-2048
-; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[14:15], off
-; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s2, v6
-; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
-; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048
-; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off
+; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[12:13], off
+; GFX90A-NEXT: global_load_dwordx2 v[22:23], v[14:15], off offset:-2048
+; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, s1, v6
+; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, -1, v7, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, s2, v6
+; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
+; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[10:11], off offset:-4096
+; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[10:11], off offset:-2048
+; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[10:11], off
+; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[12:13], off offset:-2048
+; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off offset:-2048
+; GFX90A-NEXT: ; kill: killed $vgpr12 killed $vgpr13
+; GFX90A-NEXT: ; kill: killed $vgpr10 killed $vgpr11
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off
; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
-; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX90A-NEXT: s_addk_i32 s3, 0x2000
; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff
; GFX90A-NEXT: s_waitcnt vmcnt(10)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(9)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc
@@ -767,28 +772,27 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(7)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v22, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v23, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(6)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
+; GFX90A-NEXT: s_waitcnt vmcnt(5)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(5)
+; GFX90A-NEXT: s_waitcnt vmcnt(4)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(4)
+; GFX90A-NEXT: s_waitcnt vmcnt(3)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(3)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2
; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1
@@ -1184,9 +1188,9 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 2
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff8000, v1
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v10
+; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
@@ -1194,38 +1198,37 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_clause 0x3
-; GFX10-NEXT: global_load_dword v11, v[0:1], off
-; GFX10-NEXT: global_load_dword v12, v[0:1], off offset:1024
-; GFX10-NEXT: global_load_dword v13, v[4:5], off offset:-2048
-; GFX10-NEXT: global_load_dword v14, v[2:3], off offset:1024
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x1800, v0
+; GFX10-NEXT: s_clause 0x5
+; GFX10-NEXT: global_load_dword v9, v[0:1], off
+; GFX10-NEXT: global_load_dword v10, v[0:1], off offset:1024
+; GFX10-NEXT: global_load_dword v11, v[4:5], off offset:-2048
+; GFX10-NEXT: global_load_dword v12, v[2:3], off offset:1024
+; GFX10-NEXT: global_load_dword v13, v[4:5], off
+; GFX10-NEXT: global_load_dword v14, v[6:7], off offset:1024
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x2000
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, 0x2000
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: global_load_dword v15, v[4:5], off
-; GFX10-NEXT: global_load_dword v16, v[6:7], off offset:1024
-; GFX10-NEXT: global_load_dword v17, v[2:3], off offset:1024
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x1800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: global_load_dword v2, v[8:9], off offset:-2048
-; GFX10-NEXT: global_load_dword v3, v[8:9], off
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:1024
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:-2048
+; GFX10-NEXT: global_load_dword v7, v[4:5], off offset:1024
+; GFX10-NEXT: global_load_dword v15, v[0:1], off offset:1024
+; GFX10-NEXT: global_load_dword v16, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(8)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v12, v11
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v10, v9
; GFX10-NEXT: s_waitcnt vmcnt(6)
-; GFX10-NEXT: v_add3_u32 v0, v13, v0, v14
+; GFX10-NEXT: v_add3_u32 v0, v11, v0, v12
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_add3_u32 v0, v15, v0, v16
+; GFX10-NEXT: v_add3_u32 v0, v13, v0, v14
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_add3_u32 v0, v2, v0, v17
+; GFX10-NEXT: v_add3_u32 v0, v6, v0, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add3_u32 v0, v3, v0, v4
-; GFX10-NEXT: global_store_dword v10, v0, s[34:35]
+; GFX10-NEXT: v_add3_u32 v0, v16, v0, v15
+; GFX10-NEXT: global_store_dword v8, v0, s[34:35]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: Address32:
@@ -1691,12 +1694,12 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_dword v7, v[0:1], off
; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:-2048
-; GFX10-NEXT: global_load_dword v9, v[2:3], off
-; GFX10-NEXT: global_load_dword v10, v[4:5], off offset:1024
+; GFX10-NEXT: global_load_dword v9, v[4:5], off offset:1024
+; GFX10-NEXT: global_load_dword v10, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add3_u32 v0, v10, v0, v9
+; GFX10-NEXT: v_add3_u32 v0, v9, v0, v10
; GFX10-NEXT: global_store_dword v6, v0, s[34:35]
; GFX10-NEXT: s_endpgm
;
@@ -2143,9 +2146,9 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff8000, v1
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v22
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v20
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -2161,13 +2164,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off
; GFX9-NEXT: s_movk_i32 s0, 0x1000
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dwordx2 v[14:15], v[2:3], off offset:2048
; GFX9-NEXT: global_load_dwordx2 v[16:17], v[2:3], off
-; GFX9-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048
-; GFX9-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[18:19], v[0:1], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
@@ -2177,18 +2180,19 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35]
+; GFX9-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: ReverseOrder:
@@ -2298,14 +2302,14 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:2048
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0x1000, v0
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0x1000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:2048
-; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off
+; GFX11-NEXT: global_load_b64 v[10:11], v[8:9], off offset:2048
; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off
-; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:2048
+; GFX11-NEXT: global_load_b64 v[14:15], v[12:13], off offset:2048
+; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4
@@ -2316,20 +2320,21 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
-; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
index fc154604b8700..475fcf0d81829 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
@@ -14,14 +14,14 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: s_bitcmp0_b32 s6, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %else
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[2:3] offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[2:3] offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[2:3] offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v32, s[2:3] offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[2:3] offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v32, s[2:3] offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[2:3] offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v32, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[2:3] offset:16
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: v_mov_b32_e32 v34, 4.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -34,14 +34,14 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; CHECK-NEXT: .LBB0_3: ; %if
; CHECK-NEXT: s_nop 15
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v32, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v32, s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v32, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[0:1] offset:16
; CHECK-NEXT: v_mov_b32_e32 v32, 2.0
; CHECK-NEXT: v_mov_b32_e32 v33, 4.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -84,14 +84,14 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace
; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
; CHECK-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
; CHECK-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
; CHECK-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
; CHECK-NEXT: v_mov_b32_e32 v64, 4.0
; CHECK-NEXT: v_mov_b32_e32 v65, 2.0
; CHECK-NEXT: .LBB1_1: ; %loop
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx90a.ll
index 7d00b12e7334a..9fc0e6abd6334 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx90a.ll
@@ -7,10 +7,10 @@ define void @test_rewrite_mfma_i32_32x32x8i8(i32 %arg0, i32 %arg1, ptr addrspace
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x8i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -44,14 +44,14 @@ define void @test_rewrite_mfma_f32_32x32x2bf16(<2 x i16> %arg0, <2 x i16> %arg1,
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x2bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -68,10 +68,10 @@ define void @test_rewrite_mfma_f32_16x16x2bf16(<2 x i16> %arg0, <2 x i16> %arg1,
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x2bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -105,10 +105,10 @@ define void @test_rewrite_mfma_f32_32x32x4bf16(<2 x i16> %arg0, <2 x i16> %arg1,
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx950.ll
index b2465b02f2eee..ece60b9b10d8a 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.gfx950.ll
@@ -24,10 +24,10 @@ define void @test_rewrite_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %ar
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -61,10 +61,10 @@ define void @test_rewrite_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1,
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x32_i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -98,10 +98,10 @@ define void @test_rewrite_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_bf16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -240,10 +240,10 @@ define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[18:19], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[18:19], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[18:19], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[18:19], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[18:19], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[18:19], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
; CHECK-NEXT: ;;#ASMSTART
@@ -263,10 +263,10 @@ define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[16:17], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[16:17], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[16:17], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[16:17], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[16:17], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[16:17], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -286,10 +286,10 @@ define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v12, v13 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; CHECK-NEXT: ;;#ASMSTART
@@ -309,10 +309,10 @@ define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[12:13], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[12:13], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[12:13], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[12:13], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[12:13], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[12:13], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
; CHECK-NEXT: ;;#ASMSTART
@@ -332,10 +332,10 @@ define void @test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
; CHECK-LABEL: test_rewrite_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[10:11], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[10:11], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[10:11], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[10:11], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[10:11], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v8, v9 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; CHECK-NEXT: ;;#ASMSTART
@@ -355,10 +355,10 @@ define void @test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
; CHECK-NEXT: ;;#ASMSTART
@@ -399,10 +399,10 @@ define void @test_rewrite_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
@@ -440,10 +440,10 @@ define void @test_rewrite_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
@@ -481,10 +481,10 @@ define void @test_rewrite_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v12
; CHECK-NEXT: ;;#ASMSTART
@@ -579,10 +579,10 @@ define void @test_rewrite_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
@@ -601,10 +601,10 @@ define void @test_rewrite_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
@@ -623,10 +623,10 @@ define void @test_rewrite_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
@@ -645,10 +645,10 @@ define void @test_rewrite_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v15, v14
; CHECK-NEXT: v_mov_b32_e32 v14, v13
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[14:15], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[14:15], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[14:15], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[14:15], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v12 cbsz:1 abid:2
; CHECK-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index e29be2b744874..6910eb317b996 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -42,19 +42,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 7, v0
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v4, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v4, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v4, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v4, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v4, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v4, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v4, s[0:1] offset:16
; CHECK-NEXT: v_accvgpr_write_b32 a0, 1.0
; CHECK-NEXT: v_accvgpr_write_b32 a1, 2.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -123,19 +123,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 7, v0
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v4, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v4, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v4, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v4, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v4, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v4, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v4, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
@@ -237,18 +237,18 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_to_agpr_class(ptr addrs
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 7, v0
; CHECK-NEXT: v_mov_b32_e32 v32, 2.0
; CHECK-NEXT: v_mov_b32_e32 v33, 4.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 a[24:27], v4, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v4, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 a[16:19], v4, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v4, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 a[8:11], v4, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v4, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 a[0:3], v4, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v4, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -283,14 +283,14 @@ define void @test_rewrite_mfma_subreg_extract0(float %arg0, float %arg1, ptr add
; CHECK-LABEL: test_rewrite_mfma_subreg_extract0:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -309,14 +309,14 @@ define void @test_rewrite_mfma_subreg_extract1(float %arg0, float %arg1, ptr add
; CHECK-LABEL: test_rewrite_mfma_subreg_extract1:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -336,14 +336,14 @@ define void @test_rewrite_mfma_subreg_extract2(float %arg0, float %arg1, ptr add
; CHECK-LABEL: test_rewrite_mfma_subreg_extract2:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: s_nop 15
@@ -832,14 +832,14 @@ define void @test_rewrite_mfma_f32_32x32x1f32(float %arg0, float %arg1, ptr addr
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x1f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -856,10 +856,10 @@ define void @test_rewrite_mfma_f32_16x16x1f32(float %arg0, float %arg1, ptr addr
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x1f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -893,10 +893,10 @@ define void @test_rewrite_mfma_f32_32x32x2f32(float %arg0, float %arg1, ptr addr
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x2f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -930,14 +930,14 @@ define void @test_rewrite_mfma_f32_32x32x4f16(<4 x half> %arg0, <4 x half> %arg1
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[4:5], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[4:5], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[4:5], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[4:5], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[4:5], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[4:5], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -954,10 +954,10 @@ define void @test_rewrite_mfma_f32_16x16x4f16(<4 x half> %arg0, <4 x half> %arg1
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x4f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -991,10 +991,10 @@ define void @test_rewrite_mfma_f32_32x32x8f16(<4 x half> %arg0, <4 x half> %arg1
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x8f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1028,14 +1028,14 @@ define void @test_rewrite_mfma_i32_32x32x4i8(i32 %arg0, i32 %arg1, ptr addrspace
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x4i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v0, v1, a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -1052,10 +1052,10 @@ define void @test_rewrite_mfma_i32_16x16x4i8(i32 %arg0, i32 %arg1, ptr addrspace
; CHECK-LABEL: test_rewrite_mfma_i32_16x16x4i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1093,14 +1093,14 @@ define void @test_rewrite_mfma_f32_32x32x4bf16_1k(<4 x i16> %arg0, <4 x i16> %ar
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4bf16_1k:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[28:31], v[4:5], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v[4:5], off offset:96
-; CHECK-NEXT: global_load_dwordx4 a[20:23], v[4:5], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[4:5], off offset:112
; CHECK-NEXT: global_load_dwordx4 a[16:19], v[4:5], off offset:64
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[4:5], off offset:80
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[0:1], v[2:3], a[0:31]
; CHECK-NEXT: ;;#ASMSTART
@@ -1117,10 +1117,10 @@ define void @test_rewrite_mfma_f32_16x16x4bf16_1k(<4 x i16> %arg0, <4 x i16> %ar
; CHECK-LABEL: test_rewrite_mfma_f32_16x16x4bf16_1k:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1154,10 +1154,10 @@ define void @test_rewrite_mfma_f32_32x32x8bf16_1k(<4 x i16> %arg0, <4 x i16> %ar
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x8bf16_1k:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1191,8 +1191,8 @@ define void @test_rewrite_mfma_f64_16x16x4f64(double %arg0, double %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f64_16x16x4f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
; CHECK-NEXT: ;;#ASMSTART
@@ -1247,10 +1247,10 @@ define void @test_rewrite_mfma_i32_32x32x16_i8(i64 %arg0, i64 %arg1, ptr addrspa
; CHECK-LABEL: test_rewrite_mfma_i32_32x32x16_i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1284,10 +1284,10 @@ define void @test_rewrite_mfma_f32_32x32x4_xf32(<2 x float> %arg0, <2 x float> %
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x4_xf32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1372,10 +1372,10 @@ define void @test_rewrite_mfma_f32_32x32x16_bf8_bf8(i64 %arg0, i64 %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_bf8_bf8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1392,10 +1392,10 @@ define void @test_rewrite_mfma_f32_32x32x16_bf8_fp8(i64 %arg0, i64 %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_bf8_fp8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1412,10 +1412,10 @@ define void @test_rewrite_mfma_f32_32x32x16_fp8_bf8(i64 %arg0, i64 %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_fp8_bf8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1432,10 +1432,10 @@ define void @test_rewrite_mfma_f32_32x32x16_fp8_fp8(i64 %arg0, i64 %arg1, ptr ad
; CHECK-LABEL: test_rewrite_mfma_f32_32x32x16_fp8_fp8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[4:5], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[4:5], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[4:5], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[4:5], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15]
; CHECK-NEXT: ;;#ASMSTART
@@ -1473,10 +1473,10 @@ define void @test_rewrite_smfmac_f32_32x32x16_f16(<4 x half> %arg0, <8 x half> %
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1514,10 +1514,10 @@ define void @test_rewrite_smfmac_f32_32x32x16_bf16(<4 x i16> %arg0, <8 x i16> %a
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1555,10 +1555,10 @@ define void @test_rewrite_smfmac_i32_32x32x32_i8(<2 x i32> %arg0, <4 x i32> %arg
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1653,10 +1653,10 @@ define void @test_rewrite_smfmac_32x32x32_bf8_bf8(<2 x i32> %arg0, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1675,10 +1675,10 @@ define void @test_rewrite_smfmac_32x32x32_bf8_fp8(<2 x i32> %arg0, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1697,10 +1697,10 @@ define void @test_rewrite_smfmac_32x32x32_fp8_bf8(<2 x i32> %arg0, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
@@ -1719,10 +1719,10 @@ define void @test_rewrite_smfmac_32x32x32_fp8_fp8(<2 x i32> %arg0, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v9, v8
; CHECK-NEXT: v_mov_b32_e32 v8, v7
-; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[8:11], v[8:9], off offset:32
-; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[8:9], off offset:48
; CHECK-NEXT: global_load_dwordx4 a[0:3], v[8:9], off
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[8:9], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[0:1], v[2:5], v6
; CHECK-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index d46278633f341..274c386c0391c 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -606,46 +606,46 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_ffbh_i32_e32 v11, v4
; GFX8-NEXT: v_ffbh_i32_e32 v13, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8
; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
; GFX8-NEXT: v_ffbh_i32_e32 v15, v8
; GFX8-NEXT: v_ffbh_i32_e32 v17, v6
-; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14
; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16
; GFX8-NEXT: v_min_u32_e32 v0, v11, v0
; GFX8-NEXT: v_min_u32_e32 v11, v13, v12
-; GFX8-NEXT: v_min_u32_e32 v12, v15, v14
-; GFX8-NEXT: v_min_u32_e32 v13, v17, v16
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v14
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, 32, v16
; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4]
; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
-; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
+; GFX8-NEXT: v_min_u32_e32 v11, v15, v12
+; GFX8-NEXT: v_min_u32_e32 v12, v17, v13
+; GFX8-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8]
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
-; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
-; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
+; GFX8-NEXT: v_min_u32_e32 v1, 1, v7
+; GFX8-NEXT: v_min_u32_e32 v4, 1, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v1
; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
-; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
-; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
+; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11
+; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12
; GFX8-NEXT: v_ldexp_f32 v1, v3, v14
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_ldexp_f32 v3, v5, v11
@@ -985,58 +985,58 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_ffbh_i32_e32 v11, v4
; GFX8-NEXT: v_ffbh_i32_e32 v13, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8
; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
; GFX8-NEXT: v_ffbh_i32_e32 v15, v8
; GFX8-NEXT: v_ffbh_i32_e32 v17, v6
-; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14
; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16
; GFX8-NEXT: v_min_u32_e32 v0, v11, v0
; GFX8-NEXT: v_min_u32_e32 v11, v13, v12
-; GFX8-NEXT: v_min_u32_e32 v12, v15, v14
-; GFX8-NEXT: v_min_u32_e32 v13, v17, v16
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v14
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, 32, v16
; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4]
; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
-; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
+; GFX8-NEXT: v_min_u32_e32 v11, v15, v12
+; GFX8-NEXT: v_min_u32_e32 v12, v17, v13
+; GFX8-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8]
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
-; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
-; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
+; GFX8-NEXT: v_min_u32_e32 v1, 1, v7
+; GFX8-NEXT: v_min_u32_e32 v4, 1, v5
; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
-; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
-; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
+; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11
+; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12
; GFX8-NEXT: v_ldexp_f32 v3, v3, v14
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT: v_ldexp_f32 v1, v1, v11
-; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v0
+; GFX8-NEXT: v_ldexp_f32 v0, v1, v11
+; GFX8-NEXT: v_ldexp_f32 v1, v4, v12
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v4
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 67dae136afb72..0a290fb08281f 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -119,8 +119,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.131, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.147, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.152, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.152, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.136, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.164, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.142, addrspace 4)
@@ -153,12 +153,12 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.172, addrspace 4)
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.185, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index d10ef709f8e33..cb0f9b715e69e 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -343,13 +343,13 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3
-; GFX9-NEXT: v_sub_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1
-; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0
+; GFX9-NEXT: v_sub_u32_e32 v3, v3, v7
+; GFX9-NEXT: v_sub_u32_e32 v2, v2, v6
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5
+; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -359,13 +359,13 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16
-; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3
-; GFX12-NEXT: v_sub_nc_u32_e32 v2, v6, v2
-; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1
-; GFX12-NEXT: v_sub_nc_u32_e32 v0, v4, v0
+; GFX12-NEXT: v_sub_nc_u32_e32 v3, v3, v7
+; GFX12-NEXT: v_sub_nc_u32_e32 v2, v2, v6
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v5
+; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v4
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
%b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
@@ -749,36 +749,36 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
;
; GFX9-LABEL: v_sub_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: v_sub_i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX12-NEXT: global_load_b64 v[2:3], v2, s[4:5]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
+; GFX12-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
@@ -843,41 +843,41 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
;
; GFX9-LABEL: v_test_sub_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: v_test_sub_v2i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3]
-; GFX12-NEXT: global_load_b128 v[4:7], v4, s[4:5]
+; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7]
+; GFX12-NEXT: global_load_b128 v[4:7], v4, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
+; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
-; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v3, null, v7, v3, vcc_lo
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v4, v0
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
-; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v5, v1, vcc_lo
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index d25178f11063c..d15a822052858 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -507,27 +507,27 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v12, v8
; GFX8-NEXT: v_ffbh_u32_e32 v13, v6
-; GFX8-NEXT: v_min_u32_e32 v12, 32, v12
-; GFX8-NEXT: v_min_u32_e32 v13, 32, v13
; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
-; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
+; GFX8-NEXT: v_min_u32_e32 v11, 32, v12
+; GFX8-NEXT: v_min_u32_e32 v12, 32, v13
+; GFX8-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8]
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
-; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
-; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
+; GFX8-NEXT: v_min_u32_e32 v1, 1, v7
+; GFX8-NEXT: v_min_u32_e32 v4, 1, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
-; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
-; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
+; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11
+; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12
; GFX8-NEXT: v_ldexp_f32 v1, v3, v14
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_ldexp_f32 v3, v5, v11
@@ -805,39 +805,39 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v12, v8
; GFX8-NEXT: v_ffbh_u32_e32 v13, v6
-; GFX8-NEXT: v_min_u32_e32 v12, 32, v12
-; GFX8-NEXT: v_min_u32_e32 v13, 32, v13
; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
-; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
+; GFX8-NEXT: v_min_u32_e32 v11, 32, v12
+; GFX8-NEXT: v_min_u32_e32 v12, 32, v13
+; GFX8-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8]
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
-; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
-; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
+; GFX8-NEXT: v_min_u32_e32 v1, 1, v7
+; GFX8-NEXT: v_min_u32_e32 v4, 1, v5
; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
-; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
-; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
+; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11
+; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12
; GFX8-NEXT: v_ldexp_f32 v3, v3, v14
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT: v_ldexp_f32 v1, v1, v11
-; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v0
+; GFX8-NEXT: v_ldexp_f32 v0, v1, v11
+; GFX8-NEXT: v_ldexp_f32 v1, v4, v12
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v4
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 5e18b469a4e88..8bfd38a351b2d 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -218,20 +218,19 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:16
; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:16
; GFX942-NEXT: .LBB5_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:16
-; GFX942-NEXT: s_waitcnt vmcnt(1)
; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
@@ -260,68 +259,64 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:224
; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:176
; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:96
; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:48
; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:16
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v0, s[2:3] offset:224
; GFX942-NEXT: global_load_dwordx4 v[30:33], v0, s[2:3] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[26:29], v0, s[2:3] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[22:25], v0, s[2:3] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[18:21], v0, s[2:3] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[14:17], v0, s[2:3] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[10:13], v0, s[2:3] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v0, s[2:3] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v0, s[2:3] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v0, s[2:3] offset:176
; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3] offset:128
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v0, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v0, s[2:3] offset:96
; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[58:61], v0, s[2:3] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[54:57], v0, s[2:3] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[50:53], v0, s[2:3] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[46:49], v0, s[2:3] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[42:45], v0, s[2:3] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[38:41], v0, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v0, s[2:3] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v0, s[2:3] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v0, s[2:3] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v0, s[2:3] offset:48
; GFX942-NEXT: global_load_dwordx4 v[34:37], v0, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v0, s[2:3] offset:16
; GFX942-NEXT: .LBB6_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(7)
+; GFX942-NEXT: s_waitcnt vmcnt(6)
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] offset:112
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[58:61], s[6:7] offset:96
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[54:57], s[6:7] offset:80
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[50:53], s[6:7] offset:64
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[46:49], s[6:7] offset:48
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[42:45], s[6:7] offset:32
-; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v1, v[38:41], s[6:7] offset:16
-; GFX942-NEXT: s_waitcnt vmcnt(7)
+; GFX942-NEXT: global_store_dwordx4 v1, v[54:57], s[6:7] offset:96
+; GFX942-NEXT: s_waitcnt vmcnt(6)
+; GFX942-NEXT: global_store_dwordx4 v1, v[58:61], s[6:7] offset:80
+; GFX942-NEXT: global_store_dwordx4 v1, v[46:49], s[6:7] offset:64
+; GFX942-NEXT: s_waitcnt vmcnt(6)
+; GFX942-NEXT: global_store_dwordx4 v1, v[50:53], s[6:7] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, v[38:41], s[6:7] offset:32
+; GFX942-NEXT: s_waitcnt vmcnt(6)
+; GFX942-NEXT: global_store_dwordx4 v1, v[42:45], s[6:7] offset:16
; GFX942-NEXT: global_store_dwordx4 v1, v[34:37], s[6:7]
; GFX942-NEXT: global_store_dwordx4 v1, v[30:33], s[6:7] offset:240
-; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[6:7] offset:224
-; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[6:7] offset:208
-; GFX942-NEXT: global_store_dwordx4 v1, v[18:21], s[6:7] offset:192
-; GFX942-NEXT: global_store_dwordx4 v1, v[14:17], s[6:7] offset:176
-; GFX942-NEXT: global_store_dwordx4 v1, v[10:13], s[6:7] offset:160
-; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:144
+; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[6:7] offset:224
+; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[6:7] offset:208
+; GFX942-NEXT: global_store_dwordx4 v1, v[14:17], s[6:7] offset:192
+; GFX942-NEXT: global_store_dwordx4 v1, v[18:21], s[6:7] offset:176
+; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:160
+; GFX942-NEXT: global_store_dwordx4 v1, v[10:13], s[6:7] offset:144
; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7] offset:128
; GFX942-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 0fdc1a83dddbd..f0cef0e2eebe8 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3270,16 +3270,16 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
+; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
-; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
-; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
+; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v3
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3315,6 +3315,9 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
+; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
@@ -3322,9 +3325,6 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v3
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
More information about the llvm-commits
mailing list