[llvm] [AMDGPU] Enable reordering of VMEM loads during clustering (PR #107986)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 10 01:41:39 PDT 2024


https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/107986

Add fine-grained control over ReorderWhileClustering by adding a
canReorderClusterMemOps query to TargetInstrInfo.
Implement this to return true for RISC-V to maintain current behaviour.

On AMDGPU enable ReorderWhileClustering for loads and implement
canReorderClusterMemOps to reject reordering for operations other
than VMEM.
The intention of doing this is to allow some additional overlap
of computation with memory loads with large clauses.
Loads will be issued in an order closer to their usage more often,
producing more incremental s_waitcnt values.
On average this yields a very small reduction in VGPR pressure,
although edge cases may see increased pressure.
Reordering SMEM/LDS access is not beneficial as these must always
be waitcnt 0.

For the benefit of future tuning add support for function metadata
"amdgpu-reorder-loads-while-clustering" (set to either 0 or 1) to
disable/enable reordering behaviour per function.

>From 6c08cb7c93f72b7ee4f034f6c0f5a7be00e36133 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Tue, 10 Sep 2024 15:24:25 +0900
Subject: [PATCH 1/2] [AMDGPU] Enable reordering of VMEM loads during
 clustering

Add fine-grained control over ReorderWhileClustering by adding a
canReorderClusterMemOps query to TargetInstrInfo.
Implement this to return true for RISC-V to maintain current behaviour.

On AMDGPU enable ReorderWhileClustering for loads and implement
canReorderClusterMemOps to reject reordering for operations other
than VMEM.
The intention of doing this is to allow some additional overlap
of computation with memory loads: because loads will be issued in
an order closer to their usage, more incremental s_waitcnt values
can be introduced.
On average this yields a very small reduction in VGPR pressure,
although edge cases may see increased pressure.
Reordering SMEM/LDS access is not beneficial as these must always
be waitcnt 0.

For the benefit of future tuning add support for function metadata
"amdgpu-reorder-loads-while-clustering" (set to either 0 or 1) to
disable/enable reordering behaviour per function.
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   | 10 +++++++
 llvm/lib/CodeGen/MachineScheduler.cpp         |  4 ++-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 27 +++++++++++++++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 17 ++++++++++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  4 +++
 llvm/lib/Target/RISCV/RISCVInstrInfo.h        |  6 +++++
 6 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 65c5788ac5cc9f..240037687ab133 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1574,6 +1574,16 @@ class TargetInstrInfo : public MCInstrInfo {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
+  /// Returns true if the two given memory operations can be reordered
+  /// while clustering.
+  /// Will only be queried if ReorderWhileClustering is enabled and
+  /// shouldClusterMemOps already returned true for the same operation pair.
+  virtual bool
+  canReorderClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                          ArrayRef<const MachineOperand *> BaseOps2) const {
+    llvm_unreachable("target did not implement canReorderClusterMemOps()");
+  }
+
   /// Reverses the branch condition of the specified condition list,
   /// returning false on success and true if it cannot be reversed.
   virtual bool
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 4e6d34346b1d80..876189a5d15621 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1891,7 +1891,9 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
 
     SUnit *SUa = MemOpa.SU;
     SUnit *SUb = MemOpb.SU;
-    if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
+    if (SUa->NodeNum > SUb->NodeNum &&
+        (!ReorderWhileClustering ||
+         !TII->canReorderClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps)))
       std::swap(SUa, SUb);
 
     // FIXME: Is this check really required?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9c9c5051393730..1aa07c705e8218 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -393,6 +393,11 @@ static cl::opt<bool>
                            cl::desc("Enable AMDGPUAttributorPass"),
                            cl::init(true), cl::Hidden);
 
+static cl::opt<bool> ReorderWhileLoadClustering(
+    "amdgpu-reorder-while-load-clustering",
+    cl::desc("Enable reordering during load clustering"), cl::init(true),
+    cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -483,12 +488,20 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
   return new SIScheduleDAGMI(C);
 }
 
+static bool getReorderWhileLoadClustering(const MachineFunction *MF) {
+  if (!ReorderWhileLoadClustering)
+    return false;
+  return !!MF->getFunction().getFnAttributeAsParsedInteger(
+      "amdgpu-reorder-loads-while-clustering", 1);
+}
+
 static ScheduleDAGInstrs *
 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   ScheduleDAGMILive *DAG =
     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createLoadClusterDAGMutation(
+      DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
@@ -510,7 +523,8 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   auto DAG = new GCNIterativeScheduler(C,
     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createLoadClusterDAGMutation(
+      DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   return DAG;
@@ -526,7 +540,8 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   auto DAG = new GCNIterativeScheduler(C,
     GCNIterativeScheduler::SCHEDULE_ILP);
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createLoadClusterDAGMutation(
+      DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
@@ -967,7 +982,8 @@ class GCNPassConfig final : public AMDGPUPassConfig {
         C, std::make_unique<PostGenericScheduler>(C),
         /*RemoveKillFlags=*/true);
     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(createLoadClusterDAGMutation(
+        DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
     if (ST.shouldClusterStores())
       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
@@ -1207,7 +1223,8 @@ llvm::ScheduleDAGInstrs *
 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createLoadClusterDAGMutation(
+      DAG->TII, DAG->TRI, getReorderWhileLoadClustering(C->MF)));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   return DAG;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c6f28af1e5e731..c8e14a8b47c8c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -581,6 +581,23 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
   return NumDWORDs <= 8;
 }
 
+bool SIInstrInfo::canReorderClusterMemOps(
+    ArrayRef<const MachineOperand *> BaseOps1,
+    ArrayRef<const MachineOperand *> BaseOps2) const {
+  const MachineInstr *FirstLdSt =
+      !BaseOps1.empty() ? BaseOps1.front()->getParent() : nullptr;
+  const MachineInstr *SecondLdSt =
+      !BaseOps2.empty() ? BaseOps2.front()->getParent() : nullptr;
+  ;
+
+  if (!FirstLdSt || !isVMEM(*FirstLdSt))
+    return false;
+  if (!SecondLdSt || !isVMEM(*SecondLdSt))
+    return false;
+
+  return true;
+}
+
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
 // the first 16 loads will be interleaved with the stores, and the next 16 will
 // be clustered as expected. It should really split into 2 16 store batches.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 71432510fdee4f..70fbb7966c6bfd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -250,6 +250,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
+  bool canReorderClusterMemOps(
+      ArrayRef<const MachineOperand *> BaseOps1,
+      ArrayRef<const MachineOperand *> BaseOps2) const override;
+
   bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
                                int64_t Offset1, unsigned NumLoads) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 457db9b9860d00..28b69a7e645c66 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -179,6 +179,12 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
+  bool canReorderClusterMemOps(
+      ArrayRef<const MachineOperand *> BaseOps1,
+      ArrayRef<const MachineOperand *> BaseOps2) const override {
+    return true;
+  }
+
   bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
                                     const MachineOperand *&BaseOp,
                                     int64_t &Offset, LocationSize &Width,

>From 541b70feef91c861cbf8acd8e9e654c3f74fadae Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Tue, 10 Sep 2024 16:18:46 +0900
Subject: [PATCH 2/2] Test updates.

---
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 176 +++----
 .../CodeGen/AMDGPU/call-argument-types.ll     |  33 +-
 llvm/test/CodeGen/AMDGPU/cluster_stores.ll    |  88 ++--
 .../fast-unaligned-load-store.private.ll      |   8 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |  74 +--
 .../insert_waitcnt_for_precise_memory.ll      |   4 +-
 ...e92561-restore-undef-scc-verifier-error.ll |   7 +-
 ...rval-bug-in-rename-independent-subregs.mir |  64 +--
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   | 176 +++----
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   | 357 +++++++-------
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    | 434 +++++++++---------
 .../AMDGPU/memcpy-param-combinations.ll       | 144 +++---
 .../AMDGPU/memmove-param-combinations.ll      | 108 ++---
 llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll |   8 +-
 .../CodeGen/AMDGPU/private-memory-atomics.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/select.f16.ll        | 105 ++---
 .../AMDGPU/splitkit-getsubrangeformask.ll     |  14 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll               |  14 +-
 18 files changed, 920 insertions(+), 896 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index d9ce1e4efe0e50..869f5f0e8c2f6c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1865,21 +1865,19 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    v_alignbit_b32 v7, v0, v1, 16
 ; GFX7-NEXT:    s_waitcnt vmcnt(9)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v37
-; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v28
-; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:64
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(9)
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v38
-; GFX7-NEXT:    v_alignbit_b32 v4, v33, v4, 16
 ; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v38
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v39
 ; GFX7-NEXT:    v_alignbit_b32 v36, v0, v1, 16
-; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v49
+; GFX7-NEXT:    v_alignbit_b32 v4, v33, v4, 16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v48
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v50
 ; GFX7-NEXT:    v_alignbit_b32 v35, v18, v19, 16
 ; GFX7-NEXT:    v_alignbit_b32 v34, v0, v1, 16
@@ -1888,11 +1886,14 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:20
 ; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v28
 ; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_alignbit_b32 v33, v6, v14, 16
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v17
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
@@ -1900,18 +1901,18 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    v_alignbit_b32 v6, v6, v14, 16
 ; GFX7-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v15
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
 ; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v30
-; GFX7-NEXT:    buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
 ; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v29
+; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT:    buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
 ; GFX7-NEXT:    v_alignbit_b32 v17, v14, v15, 16
 ; GFX7-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:52
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v27
 ; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:48
 ; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v26
 ; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:44
-; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4
 ; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
 ; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:56
 ; GFX7-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:40
@@ -1929,28 +1930,27 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_alignbit_b32 v21, v0, v1, 16
-; GFX7-NEXT:    s_waitcnt vmcnt(13)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v18
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(12)
+; GFX7-NEXT:    s_waitcnt vmcnt(13)
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v19
 ; GFX7-NEXT:    v_alignbit_b32 v20, v0, v1, 16
-; GFX7-NEXT:    s_waitcnt vmcnt(11)
+; GFX7-NEXT:    s_waitcnt vmcnt(12)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v22
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(10)
+; GFX7-NEXT:    s_waitcnt vmcnt(11)
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v23
 ; GFX7-NEXT:    v_alignbit_b32 v19, v0, v1, 16
-; GFX7-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v35
-; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v29
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v30
 ; GFX7-NEXT:    v_alignbit_b32 v18, v0, v1, 16
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v28
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v35
 ; GFX7-NEXT:    s_waitcnt vmcnt(3)
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v33
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
@@ -35787,15 +35787,15 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX7-NEXT:    v_alignbit_b32 v8, v8, v17, 16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v25
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GFX7-NEXT:    v_alignbit_b32 v10, v10, v17, 16
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v28
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v28
-; GFX7-NEXT:    v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v27
+; GFX7-NEXT:    v_alignbit_b32 v11, v12, v11, 16
+; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
 ; GFX7-NEXT:    v_alignbit_b32 v17, v17, v18, 16
 ; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
@@ -36178,17 +36178,17 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v9
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GFX7-NEXT:    v_alignbit_b32 v5, v5, v6, 16
 ; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:12
 ; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
 ; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX7-NEXT:    v_alignbit_b32 v17, v18, v17, 16
 ; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
@@ -36225,6 +36225,7 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
 ; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GFX7-NEXT:    v_alignbit_b32 v29, v30, v29, 16
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
@@ -36238,52 +36239,50 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
 ; GFX7-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
 ; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX7-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    s_waitcnt vmcnt(13)
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(12)
 ; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-NEXT:    v_alignbit_b32 v6, v6, v7, 16
 ; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(13)
+; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(12)
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GFX7-NEXT:    s_waitcnt vmcnt(11)
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(10)
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
 ; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
 ; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
 ; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
 ; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
 ; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
 ; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
 ; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_mul_f32_e32 v33, 1.0, v33
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
@@ -36384,6 +36383,7 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    v_alignbit_b32 v30, v30, v31, 16
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_cndmask_b32_e32 v29, v30, v29, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
 ; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
@@ -37633,27 +37633,27 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
 ; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    v_and_b32_e32 v2, 1, v12
 ; GFX7-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v2
-; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60
-; GFX7-NEXT:    v_and_b32_e32 v3, 1, v13
 ; GFX7-NEXT:    v_writelane_b32 v31, s31, 1
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v3
-; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    v_and_b32_e32 v4, 1, v14
 ; GFX7-NEXT:    v_writelane_b32 v31, s34, 2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v4
-; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:52
 ; GFX7-NEXT:    v_and_b32_e32 v5, 1, v15
 ; GFX7-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX7-NEXT:    v_and_b32_e32 v2, 1, v12
+; GFX7-NEXT:    v_and_b32_e32 v3, 1, v13
+; GFX7-NEXT:    v_and_b32_e32 v4, 1, v14
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v5
-; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:48
 ; GFX7-NEXT:    s_waitcnt vmcnt(5)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(4)
@@ -37661,65 +37661,65 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX7-NEXT:    v_cndmask_b32_e64 v15, v1, v0, s[34:35]
 ; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v30
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_readlane_b32 s35, v31, 3
+; GFX7-NEXT:    v_readlane_b32 s34, v31, 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(4)
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_cndmask_b32_e64 v14, v2, v1, s[30:31]
-; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:40
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v29
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_readlane_b32 s31, v31, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v31, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v13, v3, v2, s[28:29]
-; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v28
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX7-NEXT:    v_cndmask_b32_e64 v12, v4, v3, s[26:27]
-; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:32
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v27
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, v5, v4, s[24:25]
-; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v26
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT:    v_readlane_b32 s35, v31, 3
-; GFX7-NEXT:    v_readlane_b32 s34, v31, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    s_waitcnt vmcnt(4)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, v0, v5, s[22:23]
 ; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v25
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX7-NEXT:    s_waitcnt vmcnt(4)
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_cndmask_b32_e64 v9, v1, v5, s[20:21]
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v24
 ; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, v2, v5, s[18:19]
 ; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v3, v5, s[16:17]
 ; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v22
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v4, v5, s[14:15]
 ; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v21
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX7-NEXT:    s_waitcnt vmcnt(4)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v0, v5, s[12:13]
@@ -39446,6 +39446,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v28
 ; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:88
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 1, v26
+; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s7, 1, v24
 ; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:84
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s8, 1, v22
@@ -39457,9 +39458,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s11, 1, v16
 ; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s12, 1, v14
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s13, 1, v12
 ; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
@@ -39549,22 +39548,23 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    v_cndmask_b32_e64 v38, v29, v68, s13
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v68
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
-; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v28, v26, s14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v49, v24, v22, s15
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
-; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v50, v67, v20, s16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v67
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v52, v66, v18, s17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v28, v26, s14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v66
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v54, v65, v16, s18
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v55, 16, v65
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 725c2d71ac5e35..5acca079beb0c8 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5400,51 +5400,52 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; VI-NEXT:    s_getpc_b64 s[4:5]
 ; VI-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg at rel32@lo+4
 ; VI-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg at rel32@hi+12
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; VI-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; CI-LABEL: tail_call_byval_align16:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; CI-NEXT:    s_getpc_b64 s[4:5]
 ; CI-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg at rel32@lo+4
 ; CI-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg at rel32@hi+12
-; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; CI-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GFX9-LABEL: tail_call_byval_align16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg at rel32@hi+12
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_setpc_b64 s[4:5]
 ;
 ; GFX11-LABEL: tail_call_byval_align16:
@@ -5465,17 +5466,17 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; HSA:       ; %bb.0: ; %entry
 ; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; HSA-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg at rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg at rel32@hi+12
-; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
 ; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; HSA-NEXT:    s_waitcnt vmcnt(2)
-; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; HSA-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; HSA-NEXT:    s_waitcnt vmcnt(1)
 ; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
+; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32
 ; HSA-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca double, align 8, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index bada3d904fbe38..35bfb629fac066 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -274,49 +274,49 @@ bb:
 define amdgpu_ps void @cluster_image_load(<8 x i32> inreg %src, <8 x i32> inreg %dst, i32 %x, i32 %y) {
 ; GFX9-LABEL: cluster_image_load:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX9-NEXT:    v_add_u32_e32 v6, 2, v0
-; GFX9-NEXT:    v_add_u32_e32 v7, 2, v1
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 2, v1
 ; GFX9-NEXT:    image_load v[2:5], v[2:3], s[0:7] dmask:0xf unorm
 ; GFX9-NEXT:    image_load v[6:9], v[6:7], s[0:7] dmask:0xf unorm
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v9
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v8
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX9-NEXT:    v_add_f32_e32 v4, v8, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v7, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, v6, v2
 ; GFX9-NEXT:    image_store v[2:5], v[0:1], s[8:15] dmask:0xf unorm
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cluster_image_load:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v12, 2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v13, 2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v1
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    image_load v[2:5], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX10-NEXT:    image_load v[6:9], v[12:13], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT:    image_load v[2:5], v[12:13], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT:    image_load v[6:9], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v9
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v8
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX10-NEXT:    v_add_f32_e32 v4, v8, v4
+; GFX10-NEXT:    v_add_f32_e32 v3, v7, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v6, v2
 ; GFX10-NEXT:    image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cluster_image_load:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 1, v0
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 1, v1
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 2, v0
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 2, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 2, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 1, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 1, v1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    image_load v[2:5], v[2:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX11-NEXT:    image_load v[6:9], v[6:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v5, v5, v9
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v3, v3, v7
+; GFX11-NEXT:    v_dual_add_f32 v2, v6, v2 :: v_dual_add_f32 v5, v9, v5
+; GFX11-NEXT:    v_dual_add_f32 v4, v8, v4 :: v_dual_add_f32 v3, v7, v3
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -398,51 +398,51 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v8, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-NEXT:    v_mov_b32_e32 v10, 1.0
 ; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v8
 ; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v9
-; GFX9-NEXT:    v_mov_b32_e32 v6, v4
-; GFX9-NEXT:    v_mov_b32_e32 v7, v4
+; GFX9-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-NEXT:    v_add_f32_e32 v8, 2.0, v8
 ; GFX9-NEXT:    v_add_f32_e32 v9, 2.0, v9
-; GFX9-NEXT:    v_mov_b32_e32 v10, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v11, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v13, v10
+; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    v_mov_b32_e32 v7, v4
+; GFX9-NEXT:    image_sample_d v[8:11], v[8:13], s[0:7], s[8:11] dmask:0xf
 ; GFX9-NEXT:    image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf
-; GFX9-NEXT:    image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v9
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v8
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v11
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v10
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v8
 ; GFX9-NEXT:    image_store v[2:5], v[0:1], s[12:19] dmask:0xf unorm
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cluster_image_sample:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v8, v0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v7, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v9, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 1.0
-; GFX10-NEXT:    v_add_f32_e32 v2, 1.0, v8
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_add_f32_e32 v8, 2.0, v7
 ; GFX10-NEXT:    v_add_f32_e32 v3, 1.0, v9
-; GFX10-NEXT:    v_mov_b32_e32 v5, v4
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_mov_b32_e32 v7, v4
-; GFX10-NEXT:    v_add_f32_e32 v8, 2.0, v8
 ; GFX10-NEXT:    v_add_f32_e32 v9, 2.0, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v11, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v12, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v13, v10
+; GFX10-NEXT:    v_add_f32_e32 v2, 1.0, v7
+; GFX10-NEXT:    v_mov_b32_e32 v5, v4
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    image_sample_d v[14:17], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT:    image_sample_d v[18:21], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_sample_d v[14:17], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_sample_d v[18:21], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_f32_e32 v5, v17, v21
-; GFX10-NEXT:    v_add_f32_e32 v4, v16, v20
-; GFX10-NEXT:    v_add_f32_e32 v3, v15, v19
-; GFX10-NEXT:    v_add_f32_e32 v2, v14, v18
+; GFX10-NEXT:    v_add_f32_e32 v5, v21, v17
+; GFX10-NEXT:    v_add_f32_e32 v4, v20, v16
+; GFX10-NEXT:    v_add_f32_e32 v3, v19, v15
+; GFX10-NEXT:    v_add_f32_e32 v2, v18, v14
 ; GFX10-NEXT:    image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index f9694dcd89abfb..35f99656530c50 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -203,17 +203,17 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 3, v0
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v3, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 7af972b96ec68c..80c1a32d3833ab 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -3509,19 +3509,19 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v8, v10
 ; CI-NEXT:    v_cvt_f32_f16_e32 v10, v11
 ; CI-NEXT:    v_cvt_f16_f32_e32 v11, v18
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:112
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:116
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; CI-NEXT:    v_or_b32_e32 v8, v10, v8
 ; CI-NEXT:    v_cvt_f16_f32_e32 v10, v11
 ; CI-NEXT:    v_cvt_f16_f32_e32 v11, v13
 ; CI-NEXT:    v_cvt_f16_f32_e32 v13, v9
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:112
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:116
 ; CI-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
 ; CI-NEXT:    v_or_b32_e32 v9, v11, v9
 ; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
@@ -3589,11 +3589,12 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_or_b32_e32 v16, v24, v25
 ; CI-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
 ; CI-NEXT:    v_or_b32_e32 v25, v28, v24
-; CI-NEXT:    s_waitcnt vmcnt(9)
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:8
+; CI-NEXT:    s_waitcnt vmcnt(10)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    s_waitcnt vmcnt(9)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT:    s_waitcnt vmcnt(7)
+; CI-NEXT:    s_waitcnt vmcnt(8)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
 ; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
@@ -3605,7 +3606,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; CI-NEXT:    v_or_b32_e32 v20, v19, v20
 ; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:20
-; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:8
 ; CI-NEXT:    s_waitcnt vmcnt(8)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; CI-NEXT:    s_waitcnt vmcnt(7)
@@ -3649,25 +3649,25 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; CI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT:    s_waitcnt vmcnt(11)
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v24
 ; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; CI-NEXT:    v_or_b32_e32 v20, v21, v20
 ; CI-NEXT:    v_add_i32_e32 v21, vcc, 0x6c, v0
 ; CI-NEXT:    buffer_store_dword v20, v21, s[0:3], 0 offen
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:16
 ; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
 ; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
+; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
 ; CI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; CI-NEXT:    s_waitcnt vmcnt(13)
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    v_or_b32_e32 v20, v23, v20
+; CI-NEXT:    s_waitcnt vmcnt(14)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT:    s_waitcnt vmcnt(12)
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v24
-; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
 ; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
 ; CI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; CI-NEXT:    v_or_b32_e32 v20, v23, v20
 ; CI-NEXT:    s_waitcnt vmcnt(9)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; CI-NEXT:    s_waitcnt vmcnt(8)
@@ -3693,32 +3693,34 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:36
 ; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT:    s_waitcnt vmcnt(5)
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    s_waitcnt vmcnt(4)
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; CI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; CI-NEXT:    v_or_b32_e32 v19, v24, v19
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
+; CI-NEXT:    s_waitcnt vmcnt(4)
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; CI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; CI-NEXT:    v_add_i32_e32 v18, vcc, 0x64, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
 ; CI-NEXT:    v_or_b32_e32 v25, v25, v26
 ; CI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x60, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; CI-NEXT:    buffer_store_dword v25, v17, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x5c, v0
-; CI-NEXT:    s_waitcnt vmcnt(5)
-; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT:    v_or_b32_e32 v19, v24, v19
-; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
-; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; CI-NEXT:    v_or_b32_e32 v21, v22, v21
 ; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:40
 ; CI-NEXT:    s_waitcnt vmcnt(5)
@@ -3730,7 +3732,7 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; CI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    s_waitcnt vmcnt(3)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
@@ -3741,22 +3743,22 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_or_b32_e32 v22, v23, v27
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:52
 ; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:48
 ; CI-NEXT:    v_or_b32_e32 v23, v28, v23
 ; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
-; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:48
 ; CI-NEXT:    s_waitcnt vmcnt(2)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
 ; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT:    v_cvt_f32_f16_e32 v28, v28
 ; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
 ; CI-NEXT:    v_or_b32_e32 v24, v24, v27
 ; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
 ; CI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 0045082eedb0a3..a63f5bbde5ef6e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -306,8 +306,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp)  {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_getpc_b64 s[16:17]
 ; GFX9-NEXT:    s_add_u32 s16, s16, byval_align16_f64_arg at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s17, s17, byval_align16_f64_arg at rel32@hi+12
@@ -315,6 +313,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp)  {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index 0adce2b84aa0d3..8da0949190987f 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -61,15 +61,16 @@ define void @issue92561(ptr addrspace(1) %arg) {
 ; SDAG-NEXT:    s_mov_b32 s7, s12
 ; SDAG-NEXT:    s_clause 0x2
 ; SDAG-NEXT:    image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT:    image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; SDAG-NEXT:    image_sample_c_lz v2, [v1, v2, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT:    image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; SDAG-NEXT:    v_mov_b32_e32 v4, v1
 ; SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; SDAG-NEXT:    v_add_f32_e32 v0, v9, v0
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; SDAG-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v1
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e800000, v0
diff --git a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir
index 98b1b69101e51d..3289ddd0181632 100644
--- a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir
+++ b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir
@@ -11,26 +11,26 @@ body: |
   ; REG_ALLOC-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11
   ; REG_ALLOC-NEXT: {{  $}}
-  ; REG_ALLOC-NEXT:   renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; REG_ALLOC-NEXT:   renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
   ; REG_ALLOC-NEXT:   renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
-  ; REG_ALLOC-NEXT:   renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; REG_ALLOC-NEXT:   renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; REG_ALLOC-NEXT:   KILL killed renamable $vgpr4
   ; REG_ALLOC-NEXT:   KILL killed renamable $vgpr2
-  ; REG_ALLOC-NEXT:   KILL killed renamable $vgpr0
   ; REG_ALLOC-NEXT:   KILL killed renamable $vgpr3
-  ; REG_ALLOC-NEXT:   renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec
-  ; REG_ALLOC-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; REG_ALLOC-NEXT:   renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec
+  ; REG_ALLOC-NEXT:   renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr0, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
   ; REG_ALLOC-NEXT:   renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec
   ; REG_ALLOC-NEXT:   renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec
   ; REG_ALLOC-NEXT:   S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc
   ; REG_ALLOC-NEXT:   renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc
-  ; REG_ALLOC-NEXT:   renamable $vgpr8 = IMPLICIT_DEF
+  ; REG_ALLOC-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
   ; REG_ALLOC-NEXT:   $exec = S_MOV_B64_term renamable $sgpr6_sgpr7
   ; REG_ALLOC-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; REG_ALLOC-NEXT:   S_BRANCH %bb.2
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT: bb.1:
   ; REG_ALLOC-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
-  ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300
+  ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT:   renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; REG_ALLOC-NEXT:   $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc
@@ -42,33 +42,33 @@ body: |
   ; REG_ALLOC-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT:   renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc
-  ; REG_ALLOC-NEXT:   renamable $vgpr8 = COPY killed renamable $sgpr1
+  ; REG_ALLOC-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr1
   ; REG_ALLOC-NEXT:   renamable $vgpr11_vgpr12 = IMPLICIT_DEF
-  ; REG_ALLOC-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+  ; REG_ALLOC-NEXT:   renamable $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMPLICIT_DEF
   ; REG_ALLOC-NEXT:   S_BRANCH %bb.1
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT: bb.3:
   ; REG_ALLOC-NEXT:   successors: %bb.5(0x80000000)
-  ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300
+  ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT:   renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec
-  ; REG_ALLOC-NEXT:   renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec
+  ; REG_ALLOC-NEXT:   renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec
   ; REG_ALLOC-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc
   ; REG_ALLOC-NEXT:   renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc
-  ; REG_ALLOC-NEXT:   renamable $vgpr8 = COPY killed renamable $sgpr1
+  ; REG_ALLOC-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr1
   ; REG_ALLOC-NEXT:   S_BRANCH %bb.5
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT: bb.4:
-  ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5
+  ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4)
-  ; REG_ALLOC-NEXT:   renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec
-  ; REG_ALLOC-NEXT:   BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+  ; REG_ALLOC-NEXT:   renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec
+  ; REG_ALLOC-NEXT:   BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
   ; REG_ALLOC-NEXT:   S_ENDPGM 0
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT: bb.5:
   ; REG_ALLOC-NEXT:   successors: %bb.4(0x80000000)
-  ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5
+  ; REG_ALLOC-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5
   ; REG_ALLOC-NEXT: {{  $}}
   ; REG_ALLOC-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
   ; REG_ALLOC-NEXT:   S_BRANCH %bb.4
@@ -78,26 +78,26 @@ body: |
   ; DEAD_INST_DEL-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11
   ; DEAD_INST_DEL-NEXT: {{  $}}
-  ; DEAD_INST_DEL-NEXT:   renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; DEAD_INST_DEL-NEXT:   renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
   ; DEAD_INST_DEL-NEXT:   renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
-  ; DEAD_INST_DEL-NEXT:   renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; DEAD_INST_DEL-NEXT:   renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; DEAD_INST_DEL-NEXT:   KILL killed renamable $vgpr4
   ; DEAD_INST_DEL-NEXT:   KILL killed renamable $vgpr2
-  ; DEAD_INST_DEL-NEXT:   KILL killed renamable $vgpr0
   ; DEAD_INST_DEL-NEXT:   KILL killed renamable $vgpr3
-  ; DEAD_INST_DEL-NEXT:   renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec
-  ; DEAD_INST_DEL-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+  ; DEAD_INST_DEL-NEXT:   renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec
+  ; DEAD_INST_DEL-NEXT:   renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr0, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
   ; DEAD_INST_DEL-NEXT:   renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec
   ; DEAD_INST_DEL-NEXT:   renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec
   ; DEAD_INST_DEL-NEXT:   S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc
   ; DEAD_INST_DEL-NEXT:   renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc
-  ; DEAD_INST_DEL-NEXT:   renamable $vgpr8 = IMPLICIT_DEF
+  ; DEAD_INST_DEL-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
   ; DEAD_INST_DEL-NEXT:   $exec = S_MOV_B64_term renamable $sgpr6_sgpr7
   ; DEAD_INST_DEL-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
   ; DEAD_INST_DEL-NEXT:   S_BRANCH %bb.2
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT: bb.1:
   ; DEAD_INST_DEL-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
-  ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300
+  ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT:   renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; DEAD_INST_DEL-NEXT:   $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc
@@ -109,33 +109,33 @@ body: |
   ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT:   renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc
-  ; DEAD_INST_DEL-NEXT:   renamable $vgpr8 = COPY killed renamable $sgpr1
+  ; DEAD_INST_DEL-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr1
   ; DEAD_INST_DEL-NEXT:   renamable $vgpr11_vgpr12 = IMPLICIT_DEF
-  ; DEAD_INST_DEL-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+  ; DEAD_INST_DEL-NEXT:   renamable $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMPLICIT_DEF
   ; DEAD_INST_DEL-NEXT:   S_BRANCH %bb.1
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT: bb.3:
   ; DEAD_INST_DEL-NEXT:   successors: %bb.5(0x80000000)
-  ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300
+  ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT:   renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec
-  ; DEAD_INST_DEL-NEXT:   renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec
+  ; DEAD_INST_DEL-NEXT:   renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec
   ; DEAD_INST_DEL-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc
   ; DEAD_INST_DEL-NEXT:   renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc
-  ; DEAD_INST_DEL-NEXT:   renamable $vgpr8 = COPY killed renamable $sgpr1
+  ; DEAD_INST_DEL-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr1
   ; DEAD_INST_DEL-NEXT:   S_BRANCH %bb.5
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT: bb.4:
-  ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5
+  ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4)
-  ; DEAD_INST_DEL-NEXT:   renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec
-  ; DEAD_INST_DEL-NEXT:   BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+  ; DEAD_INST_DEL-NEXT:   renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec
+  ; DEAD_INST_DEL-NEXT:   BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
   ; DEAD_INST_DEL-NEXT:   S_ENDPGM 0
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT: bb.5:
   ; DEAD_INST_DEL-NEXT:   successors: %bb.4(0x80000000)
-  ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5
+  ; DEAD_INST_DEL-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5
   ; DEAD_INST_DEL-NEXT: {{  $}}
   ; DEAD_INST_DEL-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
   ; DEAD_INST_DEL-NEXT:   S_BRANCH %bb.4
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 4cc47b09d813d6..07da2ea5ef733c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4592,106 +4592,106 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 16, v15
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 16, v14
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v15, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v14, 0, 16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v23
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v22
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v23, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v22, 0, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v4, off, s[88:91], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 16, v13
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 16, v12
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v13, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v5, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v6, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v7, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v21
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v20
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v21, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v20, 0, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 16, v11
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 16, v10
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v11, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v10, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v42, 16, v9
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v40, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v41, v9, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v39, v8, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v3, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v50, 16, v1
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v48, 16, v0
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v49, v1, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v47, v0, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v36
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v35
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v36, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v35, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v7, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v6, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v46, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v44, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v45, v5, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v43, v4, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v54, 16, v26
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v52, 16, v25
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v53, v26, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v51, v25, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v58, 16, v24
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v56, 16, v23
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v57, v24, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v55, v23, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v26, 16, v30
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v24, 16, v29
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v25, v30, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v23, v29, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v62, 16, v28
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v60, 16, v27
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v61, v28, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v59, v27, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v30, 16, v34
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v28, 16, v33
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v29, v34, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v27, v33, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v32
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v31
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v32, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v31, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v34, 16, v38
-; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v32, 16, v37
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v33, v38, 0, 16
-; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v31, v37, 0, 16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 16, v3
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v3, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v2, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v55, 16, v1
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v53, 16, v0
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v54, v1, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v52, v0, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v37
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v36
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v37, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v36, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v21, v19, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v19, v18, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v43, 16, v17
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v41, 16, v16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v42, v17, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v40, v16, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v47, 16, v11
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v45, 16, v10
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v46, v11, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v44, v10, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v51, 16, v9
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v49, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v50, v9, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v48, v8, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v27
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v26
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v27, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v26, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v59, 16, v25
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v57, 16, v24
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v58, v25, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v56, v24, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v26, 16, v31
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v24, 16, v30
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v25, v31, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v23, v30, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v63, 16, v29
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v61, 16, v28
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v62, v29, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v60, v28, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v30, 16, v35
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v28, 16, v34
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v29, v35, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v27, v34, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v33
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v32
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v33, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v32, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v34, 16, v39
+; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v32, 16, v38
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v33, v39, 0, 16
+; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v31, v38, 0, 16
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 7cdf270810dea0..d566725efd3d2f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -2313,8 +2313,8 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
@@ -2333,24 +2333,23 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v5
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v10
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v10
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v11
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v12
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v13
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v14
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v15
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v15
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v14
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v13
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v13
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v14
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v15
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v11
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v10
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v9
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v9
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v12, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v14, v11
 ; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
 ; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
@@ -2360,7 +2359,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
@@ -3005,17 +3004,17 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v31
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v30
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v27
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v26
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v15
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v14
@@ -3025,12 +3024,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v42, v13
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v36, v14
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v38, v15
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v29
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v28
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v28
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v29
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v30
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v31
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v25
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v24
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v24
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v25
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v26
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v27
 ; SI-NOHSA-NEXT:    buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NOHSA-NEXT:    buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
@@ -3053,38 +3052,38 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v50, v1
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v18
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v55, 31, v17
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v53, 31, v16
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v52, v16
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v54, v17
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v23
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v22
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v59, 31, v21
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v57, 31, v20
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v56, v20
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v58, v21
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v22
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v23
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v27
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v26
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v63, 31, v25
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v61, 31, v24
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v60, v24
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v62, v25
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v26
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v27
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v11
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v10
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v24, v10
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v11
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v23
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v22
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v55, 31, v21
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v53, 31, v20
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v52, v20
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v54, v21
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v22
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v23
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v19
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v18
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v59, 31, v17
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v57, 31, v16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v56, v16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v58, v17
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v18
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v19
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v31
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v30
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v63, 31, v29
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v61, 31, v28
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v60, v28
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v62, v29
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v30
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v31
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v11
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v10
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v9
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v24, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v9
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v11
 ; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
 ; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
@@ -3105,11 +3104,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
@@ -3315,10 +3314,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:80
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
@@ -3326,52 +3325,52 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v11
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v10
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v3
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v2
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v15
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v14
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v43, 31, v13
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v41, 31, v12
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v40, v12
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v42, v13
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v36, v14
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v38, v15
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v9
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v8
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v12, v8
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v14, v9
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v32, v10
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v34, v11
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v7
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v6
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(5)
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v5
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v4
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v44, v4
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v46, v5
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v43, 31, v11
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v41, 31, v10
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v9
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v8
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v44, v8
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v46, v9
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v40, v10
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v42, v11
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v5
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v36, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v38, v7
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v32, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v34, v3
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(4)
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v15
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v14
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v14
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v15
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v19
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v14, v19
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
 ; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v23
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v51, 31, v1
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v49, 31, v0
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v48, v0
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v50, v1
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v18
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v51, 31, v13
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v49, 31, v12
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v48, v12
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v50, v13
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v18
 ; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v55, 31, v17
 ; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v53, 31, v16
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v52, v16
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v54, v17
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v12, v18
 ; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v22
 ; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v59, 31, v21
 ; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v57, 31, v20
@@ -3382,36 +3381,36 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v27
 ; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v26
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v25
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v24
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:224
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:240
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v46, 31, v25
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v44, 31, v24
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v38, 31, v31
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v30
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v37, v31
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v29
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v28
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v12, v28
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v14, v29
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v31
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v30
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v41, v31
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v29
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v28
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v28
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v29
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:208
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v35, v30
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v39, v24
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v41, v25
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v39, v30
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v43, v24
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v45, v25
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v19, v26
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v21, v27
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16
 ; GCNX3-NOHSA-NEXT:    s_endpgm
 ;
@@ -3823,31 +3822,31 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
 ; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
 ; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v9
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(8) expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v9
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v32
@@ -4403,22 +4402,22 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:96
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:112
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:64
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
@@ -4520,27 +4519,29 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
 ; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
-; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
-; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s3
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, s7
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:80
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:96
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:112
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:64
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:80
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0a76e169e9c385..8238e8d8319b93 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -594,12 +594,23 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    s_waitcnt vmcnt(17)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23
 ; CHECK-NEXT:    s_nop 0
@@ -621,19 +632,15 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
 ; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
 ; CHECK-NEXT:    flat_store_byte v[0:1], v18
 ; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
 ; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
@@ -654,18 +661,18 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:18
 ; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
 ; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40
 ; CHECK-NEXT:    s_nop 0
@@ -675,11 +682,11 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:24
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:37
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51
@@ -693,24 +700,27 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:34
 ; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:47
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:44
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
 ; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:40
 ; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54
 ; CHECK-NEXT:    s_nop 0
@@ -720,11 +730,11 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:38
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:51
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65
@@ -732,30 +742,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:50
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:63
 ; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:61
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:58
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:56
 ; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
 ; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68
 ; CHECK-NEXT:    s_nop 0
@@ -765,11 +778,11 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:52
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:65
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111
@@ -777,27 +790,18 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:64
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:72
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:70
 ; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:68
 ; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83
 ; CHECK-NEXT:    s_nop 0
@@ -807,60 +811,59 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:66
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:79
 ; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:76
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:74
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:86
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:93
 ; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:90
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:88
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:84
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
@@ -875,33 +878,41 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:80
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:111
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
 ; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
 ; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
 ; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
 ; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113
 ; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112
@@ -909,9 +920,9 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:98
 ; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:97
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:127
 ; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:126
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:125
 ; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:124
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:123
@@ -1840,12 +1851,23 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    s_waitcnt vmcnt(17)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23
 ; CHECK-NEXT:    s_nop 0
@@ -1867,19 +1889,15 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
 ; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
 ; CHECK-NEXT:    flat_store_byte v[0:1], v18
 ; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
 ; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
@@ -1900,18 +1918,18 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:18
 ; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
 ; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40
 ; CHECK-NEXT:    s_nop 0
@@ -1921,11 +1939,11 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:24
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:37
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51
@@ -1939,24 +1957,27 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:34
 ; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:47
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:44
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
 ; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:40
 ; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54
 ; CHECK-NEXT:    s_nop 0
@@ -1966,11 +1987,11 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:38
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:51
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65
@@ -1978,30 +1999,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:50
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:63
 ; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:61
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:58
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:56
 ; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
 ; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68
 ; CHECK-NEXT:    s_nop 0
@@ -2011,11 +2035,11 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:52
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:65
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111
@@ -2023,27 +2047,18 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:64
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:72
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:70
 ; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:68
 ; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83
 ; CHECK-NEXT:    s_nop 0
@@ -2053,60 +2068,59 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:66
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:79
 ; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:76
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:74
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:86
 ; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:93
 ; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:90
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:88
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:84
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
@@ -2121,33 +2135,41 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:80
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:111
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
 ; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
 ; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124
 ; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
 ; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
 ; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
 ; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113
 ; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112
@@ -2155,9 +2177,9 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:98
 ; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:97
 ; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:127
 ; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:126
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:125
 ; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:124
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:123
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
index 7575782c1b2acd..f84384dfcbe8df 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -2802,9 +2802,9 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
 ; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
@@ -2829,10 +2829,10 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:28
+; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:30
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2902,9 +2902,9 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
 ; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
@@ -2929,10 +2929,10 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:28
+; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:30
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4107,14 +4107,14 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29
@@ -4139,20 +4139,20 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
 ; CHECK-NEXT:    s_waitcnt vmcnt(15)
 ; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    v_lshl_or_b32 v15, v20, 8, v19
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    v_lshl_or_b32 v12, v24, 8, v23
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
+; CHECK-NEXT:    v_lshl_or_b32 v13, v26, 8, v25
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
 ; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v21
+; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v26
 ; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v32
@@ -4192,14 +4192,14 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
 ; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
@@ -4225,14 +4225,14 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
 ; CHECK-NEXT:    s_waitcnt vmcnt(16)
 ; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    v_lshl_or_b32 v15, v20, 8, v19
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    v_lshl_or_b32 v12, v24, 8, v23
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
+; CHECK-NEXT:    v_lshl_or_b32 v13, v26, 8, v25
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
@@ -5767,14 +5767,14 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
 ; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
 ; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29
@@ -5799,20 +5799,20 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
 ; CHECK-NEXT:    s_waitcnt vmcnt(15)
 ; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    v_lshl_or_b32 v14, v19, 8, v18
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
+; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v24
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
 ; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v20
+; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v25
 ; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v31
@@ -5853,14 +5853,14 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
 ; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
 ; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
 ; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
@@ -5886,14 +5886,14 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
 ; CHECK-NEXT:    s_waitcnt vmcnt(16)
 ; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    v_lshl_or_b32 v14, v19, 8, v18
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
+; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index cc5256620bfe08..dc8defecbede04 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -3853,12 +3853,12 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13
 ; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14
 ; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17
 ; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28
@@ -3888,13 +3888,13 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-NEXT:    v_lshl_or_b32 v10, v21, 8, v20
 ; CHECK-NEXT:    s_waitcnt vmcnt(10)
 ; CHECK-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v26, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v24, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v27
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    v_lshl_or_b32 v16, v25, 8, v24
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    v_lshl_or_b32 v14, v27, 8, v26
 ; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v10
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    v_lshl_or_b32 v15, v29, 8, v28
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    v_lshl_or_b32 v17, v31, 8, v30
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
@@ -3934,14 +3934,14 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
 ; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
 ; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
 ; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
@@ -3967,14 +3967,14 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
 ; CHECK-NEXT:    s_waitcnt vmcnt(16)
 ; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    v_lshl_or_b32 v15, v20, 8, v19
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    v_lshl_or_b32 v12, v24, 8, v23
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
+; CHECK-NEXT:    v_lshl_or_b32 v13, v26, 8, v25
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
@@ -5469,12 +5469,12 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13
 ; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14
 ; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17
 ; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28
@@ -5504,13 +5504,13 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-NEXT:    v_lshl_or_b32 v1, v7, 16, v6
 ; CHECK-NEXT:    s_waitcnt vmcnt(10)
 ; CHECK-NEXT:    v_lshl_or_b32 v9, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v23, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v28, 8, v26
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    v_lshl_or_b32 v14, v24, 8, v23
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    v_lshl_or_b32 v12, v26, 8, v25
 ; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    v_lshl_or_b32 v13, v28, 8, v27
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    v_lshl_or_b32 v15, v30, 8, v29
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
@@ -5552,14 +5552,14 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
 ; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
 ; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
 ; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
@@ -5585,14 +5585,14 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
 ; CHECK-NEXT:    s_waitcnt vmcnt(16)
 ; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    v_lshl_or_b32 v14, v19, 8, v18
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
+; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index a1099554559afa..c85059072c7fa7 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -123,13 +123,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT:    buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; SI-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
 ; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -245,13 +245,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT:    buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; SI-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 0d88466fc31b3e..d57e168e9f4f2a 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -158,8 +158,8 @@ define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 4, v2
-; GCN-NEXT:    buffer_load_dword v1, v3, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v1, v3, s[0:3], 0 offen
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index cc109595d8d703..61aaec3b943641 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -1906,21 +1906,21 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32>
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; SI-NEXT:    v_cndmask_b32_e32 v12, v18, v12, vcc
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64
 ; SI-NEXT:    v_cndmask_b32_e32 v13, v19, v13, vcc
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
 ; SI-NEXT:    v_cndmask_b32_e32 v14, v20, v14, vcc
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
 ; SI-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2109,6 +2109,7 @@ define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
 ; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56
@@ -2124,40 +2125,39 @@ define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) {
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT:    v_or_b32_e32 v24, v24, v25
+; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:124
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_or_b32_e32 v24, v24, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
 ; SI-NEXT:    v_or_b32_e32 v26, v26, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:116
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; SI-NEXT:    v_or_b32_e32 v28, v28, v29
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:100
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_or_b32_e32 v28, v28, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_or_b32_e32 v16, v16, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:124
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_or_b32_e32 v14, v14, v15
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:80
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:72
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT:    v_or_b32_e32 v14, v14, v15
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:64
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
@@ -2185,60 +2185,58 @@ define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v25, v23
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:120
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
 ; SI-NEXT:    v_or_b32_e32 v25, v27, v25
 ; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
 ; SI-NEXT:    v_or_b32_e32 v27, v29, v27
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:104
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; SI-NEXT:    v_or_b32_e32 v29, v30, v29
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:96
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; SI-NEXT:    v_or_b32_e32 v30, v31, v30
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
 ; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; SI-NEXT:    v_or_b32_e32 v31, v32, v31
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
 ; SI-NEXT:    v_or_b32_e32 v19, v32, v19
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
 ; SI-NEXT:    v_or_b32_e32 v17, v32, v17
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
 ; SI-NEXT:    v_or_b32_e32 v15, v32, v15
@@ -2815,36 +2813,36 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32>
 ; VI-NEXT:    v_cndmask_b32_e32 v51, v43, v55, vcc
 ; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v18
-; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:108
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v49
 ; VI-NEXT:    v_cndmask_b32_e32 v49, v43, v55, vcc
-; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:92
 ; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v17
-; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:84
-; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:76
-; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v48
+; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:92
 ; VI-NEXT:    v_cndmask_b32_e32 v48, v46, v43, vcc
 ; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v16
-; VI-NEXT:    s_waitcnt vmcnt(10)
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v40
-; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:84
+; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:76
 ; VI-NEXT:    v_cndmask_b32_e32 v46, v58, v46, vcc
-; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:36
-; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v41
-; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:28
 ; VI-NEXT:    v_cndmask_b32_e32 v15, v37, v15, vcc
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:20
-; VI-NEXT:    s_waitcnt vmcnt(12)
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v42
+; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:12
 ; VI-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
 ; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:4
@@ -2857,13 +2855,12 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32>
 ; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v45
 ; VI-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
-; VI-NEXT:    s_waitcnt vmcnt(10)
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v47
 ; VI-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
-; VI-NEXT:    s_waitcnt vmcnt(9)
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v56
 ; VI-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
-; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v43
 ; VI-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(7)
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index b87439a9d6fae7..6ee21acc6d7c25 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -69,6 +69,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
   ; CHECK-NEXT:   [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
@@ -92,7 +93,6 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %356:sgpr_128, undef %357:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %367:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
@@ -100,11 +100,12 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %362:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
+  ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
@@ -114,14 +115,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK-NEXT:   [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
-  ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -130,8 +130,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
+  ; CHECK-NEXT:   [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
@@ -154,12 +154,12 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
   ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index ab84c0c905771b..f7a5d0e03ef7be 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3270,16 +3270,16 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
-; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
-; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v3
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
 ; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
@@ -3315,6 +3315,9 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
@@ -3322,9 +3325,6 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1
 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v3
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
 ; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D



More information about the llvm-commits mailing list