[llvm] [AMDGPU] Reschedule loads in clauses to improve throughput (RFC) (PR #102595)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 31 02:18:52 PDT 2025


https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/102595

>From fe48798bedb47fdab5ed777314c6f4fea9759111 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Fri, 9 Aug 2024 18:18:41 +0900
Subject: [PATCH] [AMDGPU] Reschedule loads in clauses to improve throughput

After clauses are formed, their internal loads can be reordered
to create additional opportunities for overlapping computation.
This late-stage rescheduling causes no change in register pressure.
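
For illustration only (not part of the patch), here is a minimal standalone
sketch of the ordering key idea used by the new reorderLoads: the first-use
distance of each load's result is the major key and the original clause
position the minor key, so loads whose results are consumed soonest are
issued first while ties keep their native order. The load names and
distances below are made up; the real pass additionally refuses to place a
load whose destination register is still read by a later load in the
schedule, and clears kill flags on moved uses.

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>
  #include <vector>

  struct Load {
    const char *Name;
    unsigned UseDistance; // instructions from clause end to first use
    unsigned NativeOrder; // original position within the clause
  };

  int main() {
    // Hypothetical clause: A's result is used last, B's first.
    std::vector<Load> Clause = {
        {"load A", 5, 0}, {"load B", 0, 1}, {"load C", 2, 2}};
    std::sort(Clause.begin(), Clause.end(), [](const Load &L, const Load &R) {
      return (uint32_t(L.UseDistance) << 16 | L.NativeOrder) <
             (uint32_t(R.UseDistance) << 16 | R.NativeOrder);
    });
    for (const Load &L : Clause)
      std::printf("%s\n", L.Name); // prints: load B, load C, load A
  }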
---
 llvm/lib/Target/AMDGPU/SIPostRABundler.cpp    |  147 +-
 .../CodeGen/AMDGPU/GlobalISel/add.vni16.ll    |    6 +-
 .../GlobalISel/combine-fma-add-fma-mul.ll     |   64 +-
 .../GlobalISel/image-waterfall-loop-O0.ll     |   29 +-
 .../CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll  |   53 +-
 .../AMDGPU/GlobalISel/load-constant.96.ll     |   32 +-
 .../AMDGPU/GlobalISel/load-local.128.ll       |  103 +-
 .../AMDGPU/GlobalISel/load-local.96.ll        |   72 +-
 .../AMDGPU/GlobalISel/load-unaligned.ll       |   57 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 5661 +++++++++--------
 .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll   |   35 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll   |  425 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   | 1919 +++---
 .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll   |   52 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll   |  141 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll   |  189 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll   |  230 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll   |  339 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll   |  415 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll   |  402 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |  723 ++-
 .../AMDGPU/buffer-fat-pointers-memcpy.ll      |   52 +-
 llvm/test/CodeGen/AMDGPU/collapse-endcf.ll    |   21 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |   26 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |   26 +-
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     |   42 +-
 llvm/test/CodeGen/AMDGPU/div_i128.ll          |  679 +-
 llvm/test/CodeGen/AMDGPU/ds-alignment.ll      |   73 +-
 llvm/test/CodeGen/AMDGPU/ds_read2.ll          |   42 +-
 .../fast-unaligned-load-store.global.ll       |    6 +-
 .../fast-unaligned-load-store.private.ll      |    6 +-
 llvm/test/CodeGen/AMDGPU/freeze.ll            |  249 +-
 llvm/test/CodeGen/AMDGPU/function-args.ll     |  230 +-
 .../AMDGPU/gfx-callable-return-types.ll       |  111 +-
 llvm/test/CodeGen/AMDGPU/idot4u.ll            |   12 +-
 .../CodeGen/AMDGPU/indirect-addressing-si.ll  |  233 +-
 ...e92561-restore-undef-scc-verifier-error.ll |   11 +-
 llvm/test/CodeGen/AMDGPU/kernel-args.ll       |   12 +-
 .../test/CodeGen/AMDGPU/lds-misaligned-bug.ll |    7 +-
 llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll  |   12 +-
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll  |  127 +-
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll  |  127 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll |   89 +-
 llvm/test/CodeGen/AMDGPU/load-global-f32.ll   |    5 +-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   |   68 +-
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   |   53 +-
 llvm/test/CodeGen/AMDGPU/load-global-i8.ll    |    9 +-
 llvm/test/CodeGen/AMDGPU/load-local-i16.ll    |   15 +-
 .../AMDGPU/load-local-redundant-copies.ll     |   23 +-
 llvm/test/CodeGen/AMDGPU/load-local.128.ll    |   39 +-
 llvm/test/CodeGen/AMDGPU/load-local.96.ll     |   30 +-
 llvm/test/CodeGen/AMDGPU/max.i16.ll           |    2 +-
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    |   32 +-
 .../AMDGPU/memcpy-param-combinations.ll       |   30 +-
 .../CodeGen/AMDGPU/memintrinsic-unroll.ll     |  434 +-
 .../AMDGPU/memmove-param-combinations.ll      |  216 +-
 .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll  |  104 +-
 ...uf-legalize-operands-non-ptr-intrinsics.ll |   13 +-
 .../CodeGen/AMDGPU/mubuf-legalize-operands.ll |   24 +-
 llvm/test/CodeGen/AMDGPU/permute_i8.ll        |    4 +-
 .../AMDGPU/preserve-wwm-copy-dst-reg.ll       |  116 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      |   11 +-
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          |  394 +-
 .../AMDGPU/reschedule-bundle-loads.mir        |  198 +
 .../CodeGen/AMDGPU/spill-scavenge-offset.ll   |   25 +-
 .../test/CodeGen/AMDGPU/vector-reduce-fadd.ll |    4 +-
 .../test/CodeGen/AMDGPU/vector-reduce-fmul.ll |    4 +-
 .../CodeGen/AMDGPU/whole-wave-functions.ll    |   10 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |   34 +-
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      |    8 +-
 70 files changed, 8285 insertions(+), 6907 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 5720b978aada0..80cca7bcfde9c 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -17,6 +17,7 @@
 #include "GCNSubtarget.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include <deque>
 
 using namespace llvm;
 
@@ -50,6 +51,7 @@ class SIPostRABundler {
   bool run(MachineFunction &MF);
 
 private:
+  const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI;
 
   SmallSet<Register, 16> Defs;
@@ -60,6 +62,9 @@ class SIPostRABundler {
   bool isBundleCandidate(const MachineInstr &MI) const;
   bool isDependentLoad(const MachineInstr &MI) const;
   bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
+  void reorderLoads(MachineBasicBlock &MBB,
+                    MachineBasicBlock::instr_iterator &BundleStart,
+                    MachineBasicBlock::instr_iterator Next);
 };
 
 constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
@@ -129,6 +134,141 @@ bool SIPostRABundler::canBundle(const MachineInstr &MI,
           !isDependentLoad(NextMI));
 }
 
+static Register getDef(MachineInstr &MI) {
+  assert(MI.getNumExplicitDefs() > 0);
+  return MI.defs().begin()->getReg();
+}
+
+void SIPostRABundler::reorderLoads(
+    MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &BundleStart,
+    MachineBasicBlock::instr_iterator Next) {
+  // Don't reorder ALU, store or scalar clauses.
+  if (!BundleStart->mayLoad() || BundleStart->mayStore() ||
+      SIInstrInfo::isSMRD(*BundleStart) || !BundleStart->getNumExplicitDefs())
+    return;
+
+  // Search to find the usage distance of each defined register in the clause.
+  const unsigned SearchDistance = std::max(Defs.size(), 100UL);
+  SmallDenseMap<Register, unsigned> UseDistance;
+  unsigned MaxDistance = 0;
+  for (MachineBasicBlock::iterator SearchI = Next;
+       SearchI != MBB.end() && MaxDistance < SearchDistance &&
+       UseDistance.size() < Defs.size();
+       ++SearchI, ++MaxDistance) {
+    for (Register Reg : Defs) {
+      if (UseDistance.contains(Reg))
+        continue;
+      if (SearchI->readsRegister(Reg, TRI))
+        UseDistance[Reg] = MaxDistance;
+    }
+  }
+
+  if (UseDistance.empty())
+    return;
+
+  LLVM_DEBUG(dbgs() << "Try bundle reordering\n");
+
+  // Build a schedule based on the use distance of each defined register.
+  // Attempt to preserve the existing order (NativeOrder) where possible.
+  std::deque<std::pair<MachineInstr *, unsigned>> Schedule;
+  unsigned NativeOrder = 0, LastOrder = 0;
+  bool Reordered = false;
+  for (auto II = BundleStart; II != Next; ++II, ++NativeOrder) {
+    // Bail out if we encounter anything that seems risky to reorder.
+    if (!II->getNumExplicitDefs() || II->isKill() ||
+        llvm::any_of(II->memoperands(), [&](const MachineMemOperand *MMO) {
+          return MMO->isAtomic() || MMO->isVolatile();
+        })) {
+      LLVM_DEBUG(dbgs() << " Abort\n");
+      return;
+    }
+
+    Register Reg = getDef(*II);
+    unsigned NewOrder =
+        UseDistance.contains(Reg) ? UseDistance[Reg] : MaxDistance;
+    LLVM_DEBUG(dbgs() << "  Order: " << NewOrder << "," << NativeOrder
+                      << ", MI: " << *II);
+    unsigned Order = (NewOrder << 16 | NativeOrder);
+    Schedule.emplace_back(&*II, Order);
+    Reordered |= Order < LastOrder;
+    LastOrder = Order;
+  }
+
+  // No reordering found.
+  if (!Reordered) {
+    LLVM_DEBUG(dbgs() << " No changes\n");
+    return;
+  }
+
+  // Apply sort on new ordering.
+  std::sort(Schedule.begin(), Schedule.end(),
+            [](std::pair<MachineInstr *, unsigned> A,
+               std::pair<MachineInstr *, unsigned> B) {
+              return A.second < B.second;
+            });
+
+  // Rebuild the clause order.
+  // Schedule holds the ideal order for the load operations; however, each
+  // def can only be placed once it no longer clobbers any remaining uses.
+  SmallVector<MachineInstr *> Clause;
+  while (!Schedule.empty()) {
+      // Try to schedule the next instruction in the schedule.
+      // Iterate until we find something that can be placed.
+    auto It = Schedule.begin();
+    while (It != Schedule.end()) {
+      MachineInstr *MI = It->first;
+      LLVM_DEBUG(dbgs() << "Try schedule: " << *MI);
+
+      if (MI->getNumExplicitDefs() == 0) {
+        // No defs, always schedule.
+        LLVM_DEBUG(dbgs() << "  Trivially OK\n");
+        break;
+      }
+
+      Register DefReg = getDef(*MI);
+      bool DefRegHasUse = false;
+      for (auto SearchIt = std::next(It);
+           SearchIt != Schedule.end() && !DefRegHasUse; ++SearchIt)
+        DefRegHasUse = SearchIt->first->readsRegister(DefReg, TRI);
+      if (DefRegHasUse) {
+        // A future use would be clobbered; try next instruction in the
+        // schedule.
+        LLVM_DEBUG(dbgs() << "  Clobbers uses\n");
+        It++;
+        continue;
+      }
+
+      // Safe to schedule.
+      LLVM_DEBUG(dbgs() << "  OK!\n");
+      break;
+    }
+
+    // Place the scheduled instruction into the clause order.
+    assert(It != Schedule.end());
+    MachineInstr *MI = It->first;
+    Schedule.erase(It);
+    Clause.push_back(MI);
+
+    // Clear kill flags for later uses.
+    for (auto &Use : MI->all_uses()) {
+      if (!Use.isReg() || !Use.isKill())
+        continue;
+      Register UseReg = Use.getReg();
+      if (llvm::any_of(Schedule, [&](std::pair<MachineInstr *, unsigned> &SI) {
+            return SI.first->readsRegister(UseReg, TRI);
+          }))
+        Use.setIsKill(false);
+    }
+  }
+
+  // Apply order to instructions.
+  for (MachineInstr *MI : Clause)
+    MI->moveBefore(&*Next);
+
+  // Update start of bundle.
+  BundleStart = Clause[0]->getIterator();
+}
+
 bool SIPostRABundlerLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -143,6 +283,8 @@ PreservedAnalyses SIPostRABundlerPass::run(MachineFunction &MF,
 
 bool SIPostRABundler::run(MachineFunction &MF) {
 
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
   TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
   BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
   BitVector KillUsedRegUnits(TRI->getNumRegUnits());
@@ -170,7 +312,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
       assert(Defs.empty());
 
       if (I->getNumExplicitDefs() != 0)
-        Defs.insert(I->defs().begin()->getReg());
+        Defs.insert(getDef(*I));
 
       MachineBasicBlock::instr_iterator BundleStart = I;
       MachineBasicBlock::instr_iterator BundleEnd = I;
@@ -182,7 +324,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
         if (canBundle(*BundleEnd, *I)) {
           BundleEnd = I;
           if (I->getNumExplicitDefs() != 0)
-            Defs.insert(I->defs().begin()->getReg());
+            Defs.insert(getDef(*I));
           ++ClauseLength;
         } else if (!I->isMetaInstruction() ||
                    I->getOpcode() == AMDGPU::SCHED_BARRIER) {
@@ -234,6 +376,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
           BundleUsedRegUnits.reset();
         }
 
+        reorderLoads(MBB, BundleStart, Next);
         finalizeBundle(MBB, BundleStart, Next);
       }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index b67080bd4798d..c04f86391c44b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -716,17 +716,17 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX9-LABEL: add_v11i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v14, v[0:1], off offset:16
 ; GFX9-NEXT:    global_load_ushort v15, v[2:3], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off
 ; GFX9-NEXT:    global_load_ushort v16, v[2:3], off offset:20
 ; GFX9-NEXT:    global_load_ushort v17, v[0:1], off offset:20
 ; GFX9-NEXT:    global_load_ushort v18, v[0:1], off offset:18
 ; GFX9-NEXT:    global_load_ushort v19, v[2:3], off offset:18
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v6, v10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
index 6ea0a9446ff9d..7fca4d628d023 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
@@ -750,20 +750,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_clause 0x8
-; GFX10-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -777,20 +777,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_clause 0x8
-; GFX10-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX10-DENORM-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX10-DENORM-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; GFX10-DENORM-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
 ; GFX10-DENORM-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
 ; GFX10-DENORM-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-DENORM-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
 ; GFX10-DENORM-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -804,20 +804,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_clause 0x8
-; GFX11-CONTRACT-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v34, off, s32 offset:12
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v35, off, s32 offset:16
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v36, off, s32 offset:20
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v37, off, s32 offset:24
+; GFX11-CONTRACT-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v38, off, s32 offset:28
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v39, off, s32 offset:32
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
 ; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -833,20 +833,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_clause 0x8
-; GFX11-DENORM-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v34, off, s32 offset:12
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v35, off, s32 offset:16
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v36, off, s32 offset:20
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v37, off, s32 offset:24
+; GFX11-DENORM-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v38, off, s32 offset:28
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v39, off, s32 offset:32
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -921,20 +921,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_clause 0x8
-; GFX10-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -948,20 +948,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_clause 0x8
-; GFX10-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX10-DENORM-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX10-DENORM-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; GFX10-DENORM-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
 ; GFX10-DENORM-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
 ; GFX10-DENORM-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-DENORM-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
 ; GFX10-DENORM-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -975,20 +975,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_clause 0x8
-; GFX11-CONTRACT-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v34, off, s32 offset:12
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v35, off, s32 offset:16
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v36, off, s32 offset:20
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v37, off, s32 offset:24
+; GFX11-CONTRACT-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v38, off, s32 offset:28
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v39, off, s32 offset:32
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
 ; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -1004,20 +1004,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_clause 0x8
-; GFX11-DENORM-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v34, off, s32 offset:12
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v35, off, s32 offset:16
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v36, off, s32 offset:20
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v37, off, s32 offset:24
+; GFX11-DENORM-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v38, off, s32 offset:28
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v39, off, s32 offset:32
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index 4ed1cb2d1260e..745beaaf43330 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -69,6 +69,14 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
 ; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s21
+; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -77,22 +85,21 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
 ; CHECK-NEXT:    v_readfirstlane_b32 s12, v7
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
 ; CHECK-NEXT:    v_readfirstlane_b32 s10, v6
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
 ; CHECK-NEXT:    v_readfirstlane_b32 s9, v5
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
 ; CHECK-NEXT:    v_readfirstlane_b32 s8, v4
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
 ; CHECK-NEXT:    v_readfirstlane_b32 s7, v3
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
 ; CHECK-NEXT:    v_readfirstlane_b32 s6, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
 ; CHECK-NEXT:    v_readfirstlane_b32 s5, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    v_readfirstlane_b32 s4, v0
 ; CHECK-NEXT:    ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
 ; CHECK-NEXT:    s_mov_b32 s13, s10
@@ -110,12 +117,16 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    v_writelane_b32 v16, s17, 10
 ; CHECK-NEXT:    v_writelane_b32 v16, s18, 11
 ; CHECK-NEXT:    v_writelane_b32 v16, s19, 12
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    v_mov_b32_e32 v6, v8
 ; CHECK-NEXT:    v_mov_b32_e32 v7, v9
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v10
 ; CHECK-NEXT:    v_mov_b32_e32 v5, v11
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v12
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v13
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v14
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v15
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[12:13]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index e0016b0a5a64d..21d1b04e1aeee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -16,41 +16,41 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v2, v4
 ; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v3, v5, vcc
-; LOOP-NEXT:    buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64
 ; LOOP-NEXT:    s_waitcnt expcnt(5)
 ; LOOP-NEXT:    buffer_load_ubyte v29, v[6:7], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64
+; LOOP-NEXT:    buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3
 ; LOOP-NEXT:    s_waitcnt expcnt(2)
 ; LOOP-NEXT:    buffer_load_ubyte v31, v[6:7], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT:    buffer_load_ubyte v36, v[6:7], s[0:3], 0 addr64 offset:4
 ; LOOP-NEXT:    buffer_load_ubyte v37, v[6:7], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT:    buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6
 ; LOOP-NEXT:    buffer_load_ubyte v39, v[6:7], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT:    buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_load_ubyte v36, v[6:7], s[0:3], 0 addr64 offset:4
 ; LOOP-NEXT:    buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT:    buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10
 ; LOOP-NEXT:    buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14
 ; LOOP-NEXT:    buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT:    buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14
 ; LOOP-NEXT:    buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT:    buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18
 ; LOOP-NEXT:    buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:19
-; LOOP-NEXT:    buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT:    buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18
 ; LOOP-NEXT:    buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:21
-; LOOP-NEXT:    buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22
 ; LOOP-NEXT:    buffer_load_ubyte v25, v[6:7], s[0:3], 0 addr64 offset:23
-; LOOP-NEXT:    buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT:    buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22
 ; LOOP-NEXT:    buffer_load_ubyte v27, v[6:7], s[0:3], 0 addr64 offset:25
-; LOOP-NEXT:    buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26
 ; LOOP-NEXT:    buffer_load_ubyte v30, v[6:7], s[0:3], 0 addr64 offset:27
-; LOOP-NEXT:    buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT:    buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26
 ; LOOP-NEXT:    buffer_load_ubyte v33, v[6:7], s[0:3], 0 addr64 offset:29
-; LOOP-NEXT:    buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30
 ; LOOP-NEXT:    buffer_load_ubyte v35, v[6:7], s[0:3], 0 addr64 offset:31
+; LOOP-NEXT:    buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT:    buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT:    buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT:    buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28
 ; LOOP-NEXT:    s_waitcnt vmcnt(14)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 8, v29
 ; LOOP-NEXT:    v_or_b32_e32 v26, v6, v26
@@ -74,34 +74,41 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
-; LOOP-NEXT:    s_waitcnt vmcnt(12)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v21, 24, v21
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; LOOP-NEXT:    s_waitcnt vmcnt(10)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; LOOP-NEXT:    s_waitcnt vmcnt(8)
+; LOOP-NEXT:    s_waitcnt vmcnt(13)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v25, 24, v25
+; LOOP-NEXT:    s_waitcnt vmcnt(12)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; LOOP-NEXT:    s_waitcnt vmcnt(6)
+; LOOP-NEXT:    s_waitcnt vmcnt(11)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; LOOP-NEXT:    s_waitcnt vmcnt(4)
+; LOOP-NEXT:    s_waitcnt vmcnt(10)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v30, 24, v30
+; LOOP-NEXT:    s_waitcnt vmcnt(9)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    s_waitcnt vmcnt(8)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v33, 8, v33
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    s_waitcnt vmcnt(7)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v35, 24, v35
+; LOOP-NEXT:    s_waitcnt vmcnt(6)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; LOOP-NEXT:    s_waitcnt vmcnt(5)
 ; LOOP-NEXT:    v_or_b32_e32 v8, v11, v8
 ; LOOP-NEXT:    v_or_b32_e32 v11, v13, v12
+; LOOP-NEXT:    s_waitcnt vmcnt(4)
 ; LOOP-NEXT:    v_or_b32_e32 v9, v15, v9
 ; LOOP-NEXT:    v_or_b32_e32 v12, v17, v16
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
 ; LOOP-NEXT:    v_or_b32_e32 v10, v19, v10
 ; LOOP-NEXT:    v_or_b32_e32 v13, v21, v20
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
 ; LOOP-NEXT:    v_or_b32_e32 v14, v23, v14
 ; LOOP-NEXT:    v_or_b32_e32 v15, v25, v24
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
 ; LOOP-NEXT:    v_or_b32_e32 v16, v27, v18
 ; LOOP-NEXT:    v_or_b32_e32 v17, v30, v28
+; LOOP-NEXT:    s_waitcnt vmcnt(0)
 ; LOOP-NEXT:    v_or_b32_e32 v18, v33, v22
 ; LOOP-NEXT:    v_or_b32_e32 v19, v35, v34
 ; LOOP-NEXT:    v_or_b32_e32 v20, v29, v26
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index efa51ead0d196..232b738d1ad71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -32,12 +32,12 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX12-NOUNALIGNED-NEXT:    s_clause 0xb
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v2, v[0:1], off
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v3, v[0:1], off offset:1
-; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v4, v[0:1], off offset:2
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v5, v[0:1], off offset:3
+; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v4, v[0:1], off offset:2
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v6, v[0:1], off offset:4
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v7, v[0:1], off offset:5
-; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v8, v[0:1], off offset:6
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v9, v[0:1], off offset:7
+; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v8, v[0:1], off offset:6
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v10, v[0:1], off offset:8
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v11, v[0:1], off offset:9
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v12, v[0:1], off offset:11
@@ -45,15 +45,15 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0xa
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x9
-; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x8
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x8
+; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x5
-; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
+; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v7, v11, 8, v10
 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x1
@@ -81,12 +81,12 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX1250-NOUNALIGNED-NEXT:    s_clause 0xb
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v2, v[0:1], off
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v3, v[0:1], off offset:1
-; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v4, v[0:1], off offset:2
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v5, v[0:1], off offset:3
+; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v4, v[0:1], off offset:2
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v6, v[0:1], off offset:4
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v7, v[0:1], off offset:5
-; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v8, v[0:1], off offset:6
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v9, v[0:1], off offset:7
+; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v8, v[0:1], off offset:6
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v10, v[0:1], off offset:8
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v11, v[0:1], off offset:9
 ; GFX1250-NOUNALIGNED-NEXT:    global_load_u8 v12, v[0:1], off offset:11
@@ -95,11 +95,11 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX1250-NOUNALIGNED-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 8, v2
 ; GFX1250-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x8
-; GFX1250-NOUNALIGNED-NEXT:    v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v1, 24, v5
+; GFX1250-NOUNALIGNED-NEXT:    v_dual_lshlrev_b32 v1, 24, v5 :: v_dual_lshlrev_b32 v2, 16, v4
 ; GFX1250-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x6
 ; GFX1250-NOUNALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
 ; GFX1250-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
-; GFX1250-NOUNALIGNED-NEXT:    v_dual_lshlrev_b32 v5, 16, v8 :: v_dual_lshlrev_b32 v4, 24, v9
+; GFX1250-NOUNALIGNED-NEXT:    v_dual_lshlrev_b32 v4, 24, v9 :: v_dual_lshlrev_b32 v5, 16, v8
 ; GFX1250-NOUNALIGNED-NEXT:    v_or3_b32 v0, v1, v2, v0
 ; GFX1250-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
 ; GFX1250-NOUNALIGNED-NEXT:    v_lshl_or_b32 v6, v11, 8, v10
@@ -122,12 +122,12 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v[0:1], off offset:1
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[0:1], off offset:2
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v[0:1], off offset:3
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[0:1], off offset:2
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v[0:1], off offset:4
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v[0:1], off offset:5
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v[0:1], off offset:6
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v[0:1], off offset:7
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v[0:1], off offset:6
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v[0:1], off offset:8
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[0:1], off offset:9
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v[0:1], off offset:11
@@ -135,16 +135,16 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 8, v2
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 24, v5
+; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v1, v2, v0
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
+; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v4, v5, v3
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v6, v11, 8, v10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index d7fcbd5d623c9..8718777e8d067 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -46,36 +46,39 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u8 v1, v0
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
-; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_or3_b32 v4, v2, v3, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX9-NEXT:    v_or3_b32 v1, v2, v3, v1
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:8
 ; GFX9-NEXT:    ds_read_u8 v3, v0 offset:9
-; GFX9-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX9-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX9-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX9-NEXT:    ds_read_u8 v7, v0 offset:12
 ; GFX9-NEXT:    ds_read_u8 v8, v0 offset:13
 ; GFX9-NEXT:    ds_read_u8 v9, v0 offset:14
 ; GFX9-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX9-NEXT:    v_or3_b32 v2, v3, v5, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
@@ -91,47 +94,52 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
 ; GFX7-NEXT:    ds_read_u8 v3, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
 ; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
@@ -147,16 +155,16 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u8 v1, v0
 ; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
-; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
 ; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
-; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
 ; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
+; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
 ; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
 ; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
 ; GFX10-NEXT:    ds_read_u8 v15, v0 offset:15
@@ -164,21 +172,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(13)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
+; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
+; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v10, v14, 8, v13
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
@@ -196,16 +204,16 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u8 v1, v0
 ; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
-; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
 ; GFX11-NEXT:    ds_load_u8 v4, v0 offset:3
+; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
 ; GFX11-NEXT:    ds_load_u8 v5, v0 offset:4
 ; GFX11-NEXT:    ds_load_u8 v6, v0 offset:5
-; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
 ; GFX11-NEXT:    ds_load_u8 v8, v0 offset:7
+; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
 ; GFX11-NEXT:    ds_load_u8 v9, v0 offset:8
 ; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
-; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
 ; GFX11-NEXT:    ds_load_u8 v12, v0 offset:11
+; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
 ; GFX11-NEXT:    ds_load_u8 v13, v0 offset:12
 ; GFX11-NEXT:    ds_load_u8 v14, v0 offset:13
 ; GFX11-NEXT:    ds_load_u8 v15, v0 offset:15
@@ -213,21 +221,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(14)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
+; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX11-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
+; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v10, v14, 8, v13
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(1)
@@ -270,25 +278,28 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
-; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
+; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
-; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
+; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX7-NEXT:    ds_read_u16 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
 ; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    ds_read_u16 v7, v0 offset:12
+; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 191f2e0670e15..2a8334ab31b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -46,16 +46,17 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u8 v1, v0
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
-; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_or3_b32 v3, v2, v3, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
@@ -64,8 +65,9 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u8 v4, v0 offset:9
 ; GFX9-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX9-NEXT:    ds_read_u8 v0, v0 offset:11
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v4, 8, v2
@@ -82,36 +84,38 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:8
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:8
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
@@ -127,12 +131,12 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u8 v1, v0
 ; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
-; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
 ; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
 ; GFX10-NEXT:    ds_read_u8 v11, v0 offset:11
@@ -140,15 +144,15 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
+; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
@@ -165,12 +169,12 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u8 v1, v0
 ; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
-; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
 ; GFX11-NEXT:    ds_load_u8 v4, v0 offset:3
+; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
 ; GFX11-NEXT:    ds_load_u8 v5, v0 offset:4
 ; GFX11-NEXT:    ds_load_u8 v6, v0 offset:5
-; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
 ; GFX11-NEXT:    ds_load_u8 v8, v0 offset:7
+; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
 ; GFX11-NEXT:    ds_load_u8 v9, v0 offset:8
 ; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
 ; GFX11-NEXT:    ds_load_u8 v11, v0 offset:11
@@ -178,15 +182,15 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(9)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
+; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(1)
@@ -224,20 +228,22 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
-; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
+; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
-; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
+; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index b1de0eff05d30..d319ae066aae2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -22,47 +22,52 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
 ; GFX7-NEXT:    ds_read_u8 v3, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
 ; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
@@ -104,36 +109,38 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v1, v0
 ; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:8
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:8
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 74552a500ac51..4078058ea2196 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -3121,8 +3121,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; kill: killed $vgpr36
@@ -3284,7 +3284,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; kill: killed $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -3522,6 +3522,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB12_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v31, vcc, 3, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v32, vcc, 3, v32
@@ -4333,8 +4334,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
@@ -5302,8 +5303,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
@@ -5493,7 +5494,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    s_waitcnt vmcnt(30)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -5502,6 +5503,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
@@ -5697,6 +5699,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_add_u32_e32 v32, 3, v32
 ; GFX9-NEXT:    s_waitcnt vmcnt(28)
 ; GFX9-NEXT:    v_add_u32_e32 v31, 3, v31
@@ -11806,28 +11809,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v52
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204
@@ -11837,28 +11841,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:248
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236
@@ -11868,28 +11873,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:244
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:256
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:268
@@ -11899,28 +11905,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300
@@ -11930,28 +11937,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:308
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:320
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:344
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:332
@@ -11961,26 +11969,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:340
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:352
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:376
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:364
@@ -11990,12 +12000,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:372
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:384
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:384
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:36
@@ -13322,25 +13332,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v56
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -13348,25 +13360,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -13374,25 +13388,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -13400,25 +13416,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -13426,25 +13444,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -13452,25 +13472,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -13478,11 +13500,9 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:36
@@ -13498,6 +13518,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -14555,26 +14577,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v56
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -14582,26 +14606,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -14609,26 +14635,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -14636,26 +14664,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -14663,26 +14693,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -14690,26 +14722,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -14717,11 +14751,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:36
@@ -15639,50 +15672,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:216
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v113, off, s32 offset:388
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:8
@@ -15704,7 +15693,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
@@ -15712,6 +15700,51 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:212
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:204
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:196
@@ -15772,21 +15805,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v27.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v51.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v67.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v71.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v70.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v113
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v83.h
@@ -15821,12 +15839,24 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.h, 8, v165.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v80.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v68.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v55.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v52.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v31.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -16424,50 +16454,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:8
@@ -16489,7 +16475,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:136
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:144
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:160
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:168
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:176
@@ -16497,6 +16482,51 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v136, off, s32 offset:192
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v137, off, s32 offset:200
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v138, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v132, off, s32 offset:212
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:204
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:196
@@ -16540,61 +16570,34 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v29
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v118
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v119
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v131
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v144
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v145
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v146
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v147
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v148
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v162
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v163
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v164
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v165
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v166
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v179, 8, v179
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v180, 8, v180
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v181
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v182
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v183, 8, v183
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v162, 8, v136
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v163, 8, v137
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v164, 8, v138
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v165, 8, v103
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v166, 8, v102
@@ -16604,19 +16607,33 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v147, 8, v31
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v148, 8, v30
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v28
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v129, 8, v24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v130, 8, v22
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v131, 8, v20
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v18
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -17606,30 +17623,29 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:124
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:132
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:140
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:156
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:164
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:172
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:188
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:172
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:164
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:156
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:140
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:132
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:124
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:100
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:196
@@ -17643,14 +17659,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:220
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:228
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:236
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:252
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:268
@@ -18840,11 +18856,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
-; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v5
 ; VI-NEXT:    s_waitcnt vmcnt(11)
@@ -18859,44 +18875,44 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
-; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:76
-; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:84
-; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:92
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:108
-; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:140
-; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:156
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:164
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:172
-; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:180
-; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:188
-; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:196
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:260
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
 ; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:220
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:204
+; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:196
+; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:188
+; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:180
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:172
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:164
+; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:148
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:116
+; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:84
+; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:76
+; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
+; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:228
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:252
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:268
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:292
-; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:308
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:324
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -19848,11 +19864,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
+; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v60, 8, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(10)
@@ -19864,32 +19880,33 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:140
-; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:156
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:204
+; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:220
 ; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v42, 8, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -19900,10 +19917,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:260
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -22598,9 +22615,9 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -22633,7 +22650,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -22752,6 +22769,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v62
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -22856,6 +22874,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB16_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v32, vcc, 3, v62
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v31, vcc, 3, v63
@@ -31052,9 +31071,9 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -31087,7 +31106,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -31160,6 +31179,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB20_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -35016,8 +35036,8 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -35051,7 +35071,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; kill: killed $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -35102,6 +35122,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB24_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v31, vcc, 3, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v32, vcc, 3, v32
@@ -36348,12 +36369,12 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v31
@@ -37061,24 +37082,27 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:60
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v5
@@ -37098,31 +37122,32 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v55
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v40
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v36
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v33
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v50
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -37178,6 +37203,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v21, v0, v21
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v30
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v61
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v32
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v23
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v53
@@ -37248,6 +37274,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v35, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v14
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v53, v63
 ; SI-NEXT:    v_mov_b32_e32 v62, v52
@@ -40049,8 +40076,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; kill: killed $vgpr36
@@ -40212,7 +40239,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; kill: killed $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -40450,6 +40477,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB36_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_f32_e32 v31, 1.0, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v32, 1.0, v32
@@ -41261,8 +41289,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
@@ -42230,8 +42258,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
@@ -42421,7 +42449,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    s_waitcnt vmcnt(30)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -42430,6 +42458,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB36_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
@@ -42625,6 +42654,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB36_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_add_f32_e32 v32, 1.0, v32
 ; GFX9-NEXT:    s_waitcnt vmcnt(28)
 ; GFX9-NEXT:    v_add_f32_e32 v31, 1.0, v31
@@ -49870,28 +49900,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v52
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204
@@ -49901,28 +49932,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:248
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236
@@ -49932,28 +49964,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:244
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:256
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:268
@@ -49963,28 +49996,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300
@@ -49994,28 +50028,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:308
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:320
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:344
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:332
@@ -50025,26 +50060,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:340
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:352
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:376
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:364
@@ -50054,12 +50091,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:372
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:384
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:384
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:36
@@ -51386,25 +51423,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v56
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -51412,25 +51451,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -51438,25 +51479,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -51464,25 +51507,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -51490,25 +51535,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -51516,25 +51563,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -51542,11 +51591,9 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:36
@@ -51562,6 +51609,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -52619,26 +52668,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v56
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -52646,26 +52697,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -52673,26 +52726,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -52700,26 +52755,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -52727,26 +52784,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -52754,26 +52813,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -52781,11 +52842,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:36
@@ -53703,50 +53763,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:216
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v113, off, s32 offset:388
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:8
@@ -53768,7 +53784,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
@@ -53776,6 +53791,51 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:212
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:204
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:196
@@ -53836,21 +53896,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v27.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v51.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v67.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v71.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v70.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v113
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v83.h
@@ -53885,12 +53930,24 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.h, 8, v165.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v80.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v68.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v55.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v52.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v31.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -54488,50 +54545,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:8
@@ -54553,7 +54566,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:136
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:144
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:160
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:168
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:176
@@ -54561,6 +54573,51 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v136, off, s32 offset:192
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v137, off, s32 offset:200
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v138, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v132, off, s32 offset:212
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:204
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:196
@@ -54604,61 +54661,34 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v29
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v118
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v119
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v131
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v144
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v145
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v146
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v147
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v148
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v162
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v163
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v164
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v165
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v166
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v179, 8, v179
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v180, 8, v180
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v181
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v182
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v183, 8, v183
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v162, 8, v136
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v163, 8, v137
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v164, 8, v138
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v165, 8, v103
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v166, 8, v102
@@ -54668,19 +54698,33 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v147, 8, v31
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v148, 8, v30
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v28
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v129, 8, v24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v130, 8, v22
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v131, 8, v20
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v18
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -55670,30 +55714,29 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:124
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:132
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:140
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:156
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:164
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:172
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:188
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:172
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:164
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:156
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:140
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:132
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:124
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:100
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:196
@@ -55707,14 +55750,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:220
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:228
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:236
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:252
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:268
@@ -56904,11 +56947,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
-; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v5
 ; VI-NEXT:    s_waitcnt vmcnt(11)
@@ -56923,44 +56966,44 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
-; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:76
-; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:84
-; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:92
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:108
-; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:140
-; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:156
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:164
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:172
-; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:180
-; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:188
-; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:196
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:260
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
 ; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:220
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:204
+; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:196
+; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:188
+; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:180
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:172
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:164
+; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:148
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:116
+; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:84
+; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:76
+; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
+; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:228
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:252
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:268
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:292
-; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:308
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:324
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -57912,11 +57955,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
+; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v60, 8, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(10)
@@ -57928,32 +57971,33 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:140
-; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:156
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:204
+; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:220
 ; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v42, 8, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -57964,10 +58008,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:260
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -60662,9 +60706,9 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -60697,7 +60741,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -60816,6 +60860,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v62
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -60920,6 +60965,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB40_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_f32_e32 v32, 1.0, v62
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v31, 1.0, v63
@@ -69162,9 +69208,9 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -69197,7 +69243,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -69270,6 +69316,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB44_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -73097,8 +73144,8 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -73132,7 +73179,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; kill: killed $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -73183,6 +73230,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB48_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_f32_e32 v31, 1.0, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v32, 1.0, v32
@@ -74383,12 +74431,12 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v31
@@ -75096,24 +75144,27 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:60
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v5
@@ -75133,31 +75184,32 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v55
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v40
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v36
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v33
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v50
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -75213,6 +75265,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v21, v0, v21
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v30
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v61
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v32
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v23
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v53
@@ -75283,6 +75336,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v35, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v14
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v53, v63
 ; SI-NEXT:    v_mov_b32_e32 v62, v52
@@ -77070,8 +77124,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; kill: killed $vgpr36
@@ -77233,7 +77287,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; kill: killed $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -78282,8 +78336,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
@@ -79251,8 +79305,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
@@ -79442,7 +79496,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    s_waitcnt vmcnt(30)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -79451,6 +79505,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB56_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
@@ -85773,28 +85828,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v52
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204
@@ -85804,28 +85860,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:248
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236
@@ -85835,28 +85892,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:244
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:256
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:268
@@ -85866,28 +85924,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300
@@ -85897,28 +85956,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:308
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:320
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:344
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:332
@@ -85928,26 +85988,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:340
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:352
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:376
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:364
@@ -85957,12 +86019,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:372
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:384
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:384
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:36
@@ -87289,25 +87351,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v56
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -87315,25 +87379,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -87341,25 +87407,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -87367,25 +87435,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -87393,25 +87463,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -87419,25 +87491,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -87445,11 +87519,9 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:36
@@ -87465,6 +87537,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -88522,26 +88596,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v56
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -88549,26 +88625,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -88576,26 +88654,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -88603,26 +88683,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -88630,26 +88712,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -88657,26 +88741,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -88684,11 +88770,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:36
@@ -89606,50 +89691,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:216
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v113, off, s32 offset:388
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:8
@@ -89671,7 +89712,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
@@ -89679,6 +89719,51 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:212
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:204
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:196
@@ -89739,21 +89824,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v27.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v51.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v67.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v71.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v70.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v113
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v83.h
@@ -89788,12 +89858,24 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.h, 8, v165.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v80.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v68.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v55.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v52.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v31.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -90391,50 +90473,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:8
@@ -90456,7 +90494,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:136
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:144
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:160
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:168
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:176
@@ -90464,6 +90501,51 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v136, off, s32 offset:192
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v137, off, s32 offset:200
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v138, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v132, off, s32 offset:212
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:204
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:196
@@ -90507,61 +90589,34 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v29
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v118
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v119
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v131
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v144
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v145
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v146
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v147
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v148
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v162
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v163
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v164
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v165
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v166
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v179, 8, v179
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v180, 8, v180
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v181
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v182
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v183, 8, v183
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v162, 8, v136
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v163, 8, v137
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v164, 8, v138
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v165, 8, v103
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v166, 8, v102
@@ -90571,19 +90626,33 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v147, 8, v31
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v148, 8, v30
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v28
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v129, 8, v24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v130, 8, v22
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v131, 8, v20
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v18
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -91573,30 +91642,29 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:124
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:132
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:140
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:156
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:164
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:172
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:188
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:172
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:164
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:156
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:140
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:132
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:124
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:100
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:196
@@ -91610,14 +91678,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:220
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:228
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:236
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:252
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:268
@@ -92807,11 +92875,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
-; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v5
 ; VI-NEXT:    s_waitcnt vmcnt(11)
@@ -92826,44 +92894,44 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
-; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:76
-; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:84
-; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:92
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:108
-; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:140
-; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:156
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:164
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:172
-; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:180
-; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:188
-; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:196
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:260
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
 ; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:220
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:204
+; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:196
+; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:188
+; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:180
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:172
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:164
+; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:148
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:116
+; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:84
+; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:76
+; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
+; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:228
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:252
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:268
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:292
-; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:308
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:324
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -93815,11 +93883,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
+; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v60, 8, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(10)
@@ -93831,32 +93899,33 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:140
-; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:156
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:204
+; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:220
 ; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v42, 8, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -93867,10 +93936,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:260
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -96565,9 +96634,9 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -96600,7 +96669,7 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -96719,6 +96788,7 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v62
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -105007,9 +105077,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -105042,7 +105112,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -105115,6 +105185,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB64_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108984,8 +109055,8 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -109019,7 +109090,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; kill: killed $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -110330,12 +110401,12 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v31
@@ -111043,24 +111114,27 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:60
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v5
@@ -111080,31 +111154,32 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v55
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v40
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v36
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v33
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v50
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -111160,6 +111235,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v21, v0, v21
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v30
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v61
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v32
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v23
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v53
@@ -111230,6 +111306,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v35, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v14
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v53, v63
 ; SI-NEXT:    v_mov_b32_e32 v62, v52
@@ -112048,8 +112125,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; kill: killed $vgpr36
@@ -112211,7 +112288,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; kill: killed $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -113244,8 +113321,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
@@ -114200,8 +114277,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; kill: killed $vgpr41
@@ -114395,13 +114472,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(31)
+; GFX9-NEXT:    s_waitcnt vmcnt(32)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB72_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
+; GFX9-NEXT:    s_waitcnt vmcnt(31)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
@@ -121812,28 +121890,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v52
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204
@@ -121843,28 +121922,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:248
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236
@@ -121874,28 +121954,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:244
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:256
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:268
@@ -121905,28 +121986,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300
@@ -121936,28 +122018,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:308
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:320
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:344
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:332
@@ -121967,26 +122050,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:340
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:352
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:376
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
 ; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:364
@@ -121996,12 +122081,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:372
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:384
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:384
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:36
@@ -123328,25 +123413,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v56
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -123354,25 +123441,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -123380,25 +123469,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -123406,25 +123497,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -123432,25 +123525,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -123458,25 +123553,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -123484,11 +123581,9 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:36
@@ -123504,6 +123599,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -124561,26 +124658,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v56
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -124588,26 +124687,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -124615,26 +124716,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -124642,26 +124745,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -124669,26 +124774,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -124696,26 +124803,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -124723,11 +124832,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v63, 8, v0
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:36
@@ -125645,50 +125753,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:216
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v113, off, s32 offset:388
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:8
@@ -125710,7 +125774,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
@@ -125718,6 +125781,51 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:212
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:204
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:196
@@ -125778,21 +125886,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v27.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v51.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v67.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v71.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v70.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v113
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v83.h
@@ -125827,12 +125920,24 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.h, 8, v165.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v80.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v68.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v55.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v52.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v31.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -126430,50 +126535,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:8
@@ -126495,7 +126556,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:136
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:144
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:160
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:168
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:176
@@ -126503,6 +126563,51 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v136, off, s32 offset:192
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v137, off, s32 offset:200
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v138, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v132, off, s32 offset:212
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:204
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:196
@@ -126546,61 +126651,34 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v25
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v29
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v118
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v119
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v131
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v144
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v145
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v146
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v147
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v148
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v162
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v163
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v164
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v165
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v166
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v179, 8, v179
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v180, 8, v180
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v181
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v182
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v183, 8, v183
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v162, 8, v136
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v163, 8, v137
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v164, 8, v138
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v165, 8, v103
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v166, 8, v102
@@ -126610,19 +126688,33 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v147, 8, v31
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v148, 8, v30
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v28
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v129, 8, v24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v130, 8, v22
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v131, 8, v20
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v18
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -127612,30 +127704,29 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:124
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:132
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:140
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:156
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:164
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:172
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:188
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:172
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:164
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:156
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:140
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:132
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:124
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:100
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:196
@@ -127649,14 +127740,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:220
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:228
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:236
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:252
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:268
@@ -128846,11 +128937,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
-; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v5
 ; VI-NEXT:    s_waitcnt vmcnt(11)
@@ -128865,44 +128956,44 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
-; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:76
-; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:84
-; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:92
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:108
-; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:140
-; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:156
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:164
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:172
-; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:180
-; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:188
-; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:196
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:260
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
 ; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:220
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:204
+; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:196
+; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:188
+; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:180
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:172
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:164
+; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:148
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:116
+; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:84
+; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:76
+; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
+; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:228
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:252
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:268
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:292
-; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:308
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:324
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -129854,11 +129945,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
+; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v60, 8, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(10)
@@ -129870,32 +129961,33 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:140
-; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:156
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:204
+; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:220
 ; GFX9-NEXT:    s_waitcnt vmcnt(29)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v42, 8, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -129906,10 +129998,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:260
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -132604,8 +132696,8 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; kill: killed $vgpr35
@@ -132703,13 +132795,14 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; kill: killed $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB76_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v35, 0xffff0000, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -140994,8 +141087,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; kill: killed $vgpr35
@@ -141093,7 +141186,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; kill: killed $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -141143,6 +141236,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v52
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v31
@@ -144851,8 +144945,8 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr60
@@ -144886,7 +144980,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; kill: killed $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -146041,12 +146135,12 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v31
@@ -146754,24 +146848,27 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:60
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v5
@@ -146791,31 +146888,32 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v55
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v40
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v36
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v33
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v50
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -146871,6 +146969,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v21, v0, v21
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v30
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v61
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v32
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v23
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v53
@@ -146941,6 +147040,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v35, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v14
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v53, v63
 ; SI-NEXT:    v_mov_b32_e32 v62, v52
@@ -147778,6 +147878,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:392
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
@@ -147798,7 +147899,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:164
 ; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:188
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -147847,7 +147947,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:144
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -147874,10 +147976,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v13
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:144
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v15
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -147895,6 +147994,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v25
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
 ; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr9
@@ -147904,7 +148004,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:160
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:140
@@ -147954,24 +148054,24 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:192
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:208
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204
@@ -147981,22 +148081,23 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:252
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:224
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:240
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v2
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236
@@ -148006,21 +148107,21 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:232
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:256
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:284
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:256
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 24, v1
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 24, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:272
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 24, v1
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 24, v2
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 8, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:268
@@ -148096,6 +148197,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120
 ; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:152
@@ -148106,8 +148208,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:312
 ; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:344
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:376
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:76
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72
@@ -150032,25 +150133,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v44
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -150058,25 +150161,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -150084,25 +150189,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -150110,25 +150217,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -150136,25 +150245,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -150162,23 +150273,23 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v38, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v51, 8, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -150186,11 +150297,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:36
@@ -150206,6 +150315,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -151274,26 +151385,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v44
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -151301,26 +151414,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -151328,26 +151443,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -151355,26 +151472,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -151382,26 +151501,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -151409,24 +151530,23 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v37, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v37, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v52, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -151434,11 +151554,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:36
@@ -152361,50 +152480,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
@@ -152426,7 +152501,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
@@ -152434,6 +152508,51 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
@@ -152495,26 +152614,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
@@ -152543,6 +152642,26 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -153137,50 +153256,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v150, off, s32 offset:388
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:8
@@ -153202,7 +153277,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:136
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:144
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:160
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:168
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:176
@@ -153210,6 +153284,51 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:192
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:200
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:212
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:204
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
@@ -153254,89 +153373,71 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v167, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v150, 8, v182
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v41, 8, v40
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v40, 8, v43
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v43, 8, v44
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v45
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v46
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v47
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v56
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v58
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v58, 8, v59
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v44, 8, v60
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v60, 8, v61
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v59, 8, v62
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v63
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v72
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v73
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v74
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v74, 8, v75
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v76
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v75, 8, v77
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v78
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v78, 8, v79
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v77, 8, v89
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v90
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v76, 8, v95
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v104
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v105
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v94
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v31
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v30
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -153636,10 +153737,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB88_4
 ; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v134, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v118, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v131, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v116, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v129, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -153656,10 +153760,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v116, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v98, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v112, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v99, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v103, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
@@ -153674,10 +153780,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v81, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v81, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v101, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v0
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v97, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v83, 3
@@ -153692,14 +153800,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v107, v4
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v86, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v85, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v80, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v68, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v69, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v105, v0
@@ -155959,13 +156070,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:68
 ; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -155998,19 +156109,19 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:204
 ; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:268
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:276
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:292
 ; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:324
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -156987,11 +157098,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:328
-; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v39, 8, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
@@ -156999,100 +157110,108 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v1
-; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:108
 ; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:116
 ; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:156
 ; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:172
 ; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:180
 ; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:220
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:196
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(22)
+; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    s_waitcnt vmcnt(25)
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(24)
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(28)
+; GFX9-NEXT:    s_waitcnt vmcnt(32)
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(33)
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(31)
+; GFX9-NEXT:    s_waitcnt vmcnt(35)
 ; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(36)
 ; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(35)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(35)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
@@ -157380,6 +157499,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch .LBB89_3
 ; GFX9-NEXT:  .LBB89_2:
+; GFX9-NEXT:    s_waitcnt vmcnt(55)
 ; GFX9-NEXT:    v_mov_b32_e32 v58, v50
 ; GFX9-NEXT:    v_mov_b32_e32 v45, v59
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -161522,8 +161642,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    ; implicit-def: $vgpr45
@@ -163074,8 +163194,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s32
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -163273,15 +163393,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(33)
+; GFX9-NEXT:    s_waitcnt vmcnt(34)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; GFX9-NEXT:    ; implicit-def: $vgpr31
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB90_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 24, v8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 24, v6
@@ -163310,20 +163430,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 8, v18
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v17
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 8, v17
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
@@ -163364,6 +163485,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v27
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v27
@@ -163722,6 +163844,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
 ; GFX9-NEXT:    v_perm_b32 v33, v17, v24, s7
+; GFX9-NEXT:    s_waitcnt vmcnt(53)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v63
 ; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
 ; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
@@ -163755,8 +163878,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_perm_b32 v35, v17, v23, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
@@ -163789,8 +163912,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v37, v1, v22, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s6
 ; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
@@ -163934,7 +164057,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v31, v32, vcc
 ; GFX9-NEXT:    v_perm_b32 v54, v11, v1, s7
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
 ; GFX9-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
 ; GFX9-NEXT:    v_bfe_u32 v31, v11, 16, 1
@@ -163949,6 +164072,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v31, v32, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
 ; GFX9-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
@@ -163965,7 +164089,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v13, v31, v32, vcc
 ; GFX9-NEXT:    v_perm_b32 v41, v13, v0, s7
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v16
 ; GFX9-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; GFX9-NEXT:    v_bfe_u32 v31, v13, 16, 1
@@ -163980,6 +164104,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_e32 v32, 0x400000, v16
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v32, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v15
 ; GFX9-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
@@ -164261,8 +164386,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_mov_b32_e32 v63, v16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 8, v40
@@ -164278,10 +164403,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 24, v59
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 8, v58
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v60
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v61
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 8, v61
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v60
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 8, v60
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 8, v59
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
@@ -175501,13 +175626,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v48
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:196
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:212
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:192
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:200
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr37
@@ -175526,177 +175651,170 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:208
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:208
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:228
-; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:224
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:232
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:232
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:240
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:264
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:264
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:272
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:292
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:308
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:296
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:296
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:320
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:324
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:340
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:320
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:328
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:328
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:336
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:352
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:356
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:372
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:380
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:352
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:360
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:360
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:368
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:384
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388
 ; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
@@ -175718,11 +175836,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:120
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:112
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:120
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184
@@ -177749,25 +177867,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v44
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -177775,25 +177895,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -177801,25 +177923,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -177827,25 +177951,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -177853,25 +177979,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -177879,23 +178007,23 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v38, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v51, 8, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -177903,11 +178031,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:36
@@ -177923,6 +178049,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -178991,26 +179119,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v44
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -179018,26 +179148,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -179045,26 +179177,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -179072,26 +179206,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -179099,26 +179235,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -179126,24 +179264,23 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v37, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v37, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v52, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -179151,11 +179288,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:36
@@ -180078,50 +180214,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
@@ -180143,7 +180235,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
@@ -180151,6 +180242,51 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
@@ -180212,26 +180348,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
@@ -180260,6 +180376,26 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -180854,50 +180990,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v150, off, s32 offset:388
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:8
@@ -180919,7 +181011,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:136
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:144
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:160
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:168
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:176
@@ -180927,6 +181018,51 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:192
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:200
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:212
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:204
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
@@ -180971,89 +181107,71 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v167, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v150, 8, v182
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v41, 8, v40
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v40, 8, v43
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v43, 8, v44
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v45
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v46
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v47
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v56
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v58
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v58, 8, v59
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v44, 8, v60
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v60, 8, v61
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v59, 8, v62
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v63
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v72
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v73
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v74
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v74, 8, v75
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v76
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v75, 8, v77
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v78
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v78, 8, v79
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v77, 8, v89
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v90
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v76, 8, v95
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v104
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v105
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v94
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v31
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v30
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -181353,10 +181471,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB92_4
 ; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v134, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v118, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v131, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v116, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v129, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -181373,10 +181494,12 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v116, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v98, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v112, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v99, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v103, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
@@ -181391,10 +181514,12 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v81, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v81, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v101, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v0
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v97, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v83, 3
@@ -181409,14 +181534,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v107, v4
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v86, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v85, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v80, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v68, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v69, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v105, v0
@@ -183580,13 +183708,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:68
 ; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -183619,19 +183747,19 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:204
 ; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:268
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:276
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:292
 ; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:324
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -184608,11 +184736,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:328
-; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v39, 8, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
@@ -184620,100 +184748,108 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v1
-; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:108
 ; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:116
 ; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:156
 ; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:172
 ; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:180
 ; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:220
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:196
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(22)
+; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    s_waitcnt vmcnt(25)
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(24)
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(28)
+; GFX9-NEXT:    s_waitcnt vmcnt(32)
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(33)
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(31)
+; GFX9-NEXT:    s_waitcnt vmcnt(35)
 ; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(36)
 ; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(35)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(35)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
@@ -185001,6 +185137,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch .LBB93_3
 ; GFX9-NEXT:  .LBB93_2:
+; GFX9-NEXT:    s_waitcnt vmcnt(55)
 ; GFX9-NEXT:    v_mov_b32_e32 v58, v50
 ; GFX9-NEXT:    v_mov_b32_e32 v45, v59
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -189290,8 +189427,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB94_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; VI-NEXT:    v_mov_b32_e32 v56, v38
 ; VI-NEXT:    v_mov_b32_e32 v45, v7
 ; VI-NEXT:    v_mov_b32_e32 v63, v53
@@ -189304,19 +189441,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v29, 24, v44
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 24, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v13, 24, v18
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v38
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v37
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v44
@@ -189326,76 +189465,75 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; VI-NEXT:    v_mov_b32_e32 v62, v36
 ; VI-NEXT:    v_lshrrev_b32_e32 v41, 24, v38
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v11
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v10
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 24, v11
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v23, 8, v6
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_lshrrev_b32_e32 v23, 8, v6
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshrrev_b32_e32 v24, 8, v52
-; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v57, 24, v53
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshrrev_b32_e32 v20, 8, v53
+; VI-NEXT:    v_lshrrev_b32_e32 v24, 8, v52
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshrrev_b32_e32 v19, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v25, 8, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 24, v59
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 8, v59
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 8, v58
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 8, v26
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshrrev_b32_e32 v14, 24, v27
 ; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v27
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 8, v34
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v33
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshrrev_b32_e32 v42, 24, v34
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v22, 8, v35
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v9, 24, v36
 ; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v61, 8, v31
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
@@ -189425,6 +189563,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[0:1], 24, v[33:34]
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 8, v36
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshrrev_b32_e32 v22, 8, v35
 ; VI-NEXT:    v_lshrrev_b64 v[37:38], 24, v[35:36]
 ; VI-NEXT:    v_lshrrev_b64 v[10:11], 24, v[52:53]
 ; VI-NEXT:    v_lshrrev_b64 v[52:53], 24, v[58:59]
@@ -189444,10 +189584,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_mov_b32_e32 v29, v41
 ; VI-NEXT:    v_mov_b32_e32 v45, v60
 ; VI-NEXT:    v_lshrrev_b32_e32 v41, 8, v55
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v49
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 24, v50
 ; VI-NEXT:    v_lshrrev_b32_e32 v30, 8, v50
+; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v49
 ; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v40
 ; VI-NEXT:    v_lshrrev_b64 v[35:36], 24, v[49:50]
 ; VI-NEXT:    v_lshrrev_b64 v[49:50], 24, v[39:40]
@@ -189498,33 +189637,35 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_e32 v62, v55, v0
 ; VI-NEXT:    v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    v_add_f16_e32 v54, 0x200, v54
 ; VI-NEXT:    v_or_b32_e32 v61, v54, v0
 ; VI-NEXT:    v_mov_b32_e32 v26, v54
 ; VI-NEXT:    v_mov_b32_e32 v27, v55
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v60
 ; VI-NEXT:    v_add_f16_e32 v25, 0x200, v25
 ; VI-NEXT:    v_or_b32_e32 v34, v25, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v24, 0x200, v24
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    v_or_b32_e32 v33, v24, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
 ; VI-NEXT:    v_or_b32_e32 v36, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v1, 0x200, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
@@ -189532,41 +189673,43 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    v_or_b32_e32 v35, v1, v0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
 ; VI-NEXT:    v_or_b32_e32 v38, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v1, 0x200, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    v_or_b32_e32 v37, v1, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v8, 0x200, v8
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    v_or_b32_e32 v49, v9, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
@@ -189581,14 +189724,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_e32 v51, v3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v50, v2, v0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_f16_sdwa v3, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_f16_sdwa v3, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v1, 0x200, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
@@ -189599,28 +189742,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v44, 0x200, v44
 ; VI-NEXT:    v_or_b32_e32 v52, v1, v0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_f16_sdwa v59, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_f16_sdwa v59, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v1, 0x200, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    v_or_b32_e32 v46, v2, v0
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v59
 ; VI-NEXT:    v_or_b32_e32 v45, v1, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v6, 0x200, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v11
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
@@ -189629,45 +189772,46 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v4, v6, v0
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v6, 0x200, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v56
 ; VI-NEXT:    v_or_b32_e32 v41, v7, v0
 ; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v39
-; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_e32 v40, v6, v0
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v25, 0x200, v25
+; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v24, 0x200, v24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v42
 ; VI-NEXT:    v_or_b32_e32 v7, v25, v0
 ; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v1, 0x200, v1
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v19
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v54, 0x200, v54
 ; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
@@ -189679,7 +189823,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_e32 v31, v43, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v28
 ; VI-NEXT:    v_or_b32_e32 v30, v2, v0
-; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v55, 0x200, v55
 ; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -190223,8 +190366,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
@@ -190349,7 +190492,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -190468,6 +190611,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v2
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(45)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
@@ -190606,6 +190750,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[13:14]
+; GFX9-NEXT:    s_waitcnt vmcnt(19)
 ; GFX9-NEXT:    v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0]
 ; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0]
@@ -197789,6 +197934,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v54, v15
 ; SI-NEXT:    v_mov_b32_e32 v57, v5
 ; SI-NEXT:    v_mov_b32_e32 v41, v3
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:392
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -197808,7 +197954,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:164
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:148
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:188
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -197860,6 +198005,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; kill: killed $vgpr42
 ; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:96
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v36
 ; SI-NEXT:    v_lshlrev_b32_e32 v36, 24, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
@@ -197884,11 +198032,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v27
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:96
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v27
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v17
@@ -197916,7 +198061,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:88
@@ -197950,8 +198095,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:176
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v33
@@ -197963,48 +198108,48 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:172
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:192
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:220
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:192
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:216
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:208
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:212
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:252
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:224
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:248
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:244
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:240
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:244
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
@@ -198012,24 +198157,24 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:236
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:256
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:284
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:256
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:280
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:272
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:276
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:272
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
@@ -198037,24 +198182,24 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:268
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:316
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:312
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:304
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:308
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:304
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
@@ -198063,101 +198208,99 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v11
-; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:300
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:320
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:348
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:320
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v11
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v4
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 8, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:344
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v4
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 8, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:336
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:340
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:336
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 24, v9
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:332
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:352
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:380
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:352
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 24, v9
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v4
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:376
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v4
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:368
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:372
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:368
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:364
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:388
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:384
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:388
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v26
 ; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 24, v8
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 24, v8
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:120
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:32
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:64
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v35
@@ -200101,25 +200244,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v44
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -200127,25 +200272,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -200153,25 +200300,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -200179,25 +200328,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -200205,25 +200356,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -200231,23 +200384,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v38, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v51, 8, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -200255,11 +200408,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:36
@@ -200275,6 +200426,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -201343,26 +201496,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v44
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:192
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:200
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:216
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -201370,26 +201525,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:212
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:224
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:232
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:240
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:248
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -201397,26 +201554,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:244
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:256
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:264
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:272
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:280
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -201424,26 +201583,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:288
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:296
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:304
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:284
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -201451,26 +201612,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:308
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:328
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:336
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:344
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:316
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:332
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -201478,24 +201641,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:340
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:352
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:360
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:368
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:376
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:348
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v37, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v37, 8, v0
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v52, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -201503,11 +201665,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:384
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v53, 8, v0
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:36
@@ -202430,50 +202591,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
@@ -202495,7 +202612,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
@@ -202503,6 +202619,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
@@ -202564,26 +202725,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
@@ -202612,6 +202753,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -203206,50 +203367,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v150, off, s32 offset:388
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:8
@@ -203271,7 +203388,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:136
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:144
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:160
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:168
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:176
@@ -203279,6 +203395,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:192
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:200
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:212
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:204
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
@@ -203323,89 +203484,71 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v167, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v150, 8, v182
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v41, 8, v40
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v40, 8, v43
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v43, 8, v44
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v45
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v46
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v47
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v56
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v58
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v58, 8, v59
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v44, 8, v60
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v60, 8, v61
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v59, 8, v62
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v63
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v72
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v73
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v74
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v74, 8, v75
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v76
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v75, 8, v77
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v78
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v78, 8, v79
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v77, 8, v89
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v90
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v76, 8, v95
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v104
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v105
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v94
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v31
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v30
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -203705,10 +203848,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB96_4
 ; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v134, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v118, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v131, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v116, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v129, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -203725,10 +203871,12 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v116, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v98, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v112, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v99, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v103, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
@@ -203743,10 +203891,12 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v81, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v81, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v101, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v0
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v97, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v83, 3
@@ -203761,14 +203911,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v107, v4
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v86, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v85, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v80, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v68, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v69, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v105, v0
@@ -206003,13 +206156,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:60
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:68
 ; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -206042,19 +206195,19 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:204
 ; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:244
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:268
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:276
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:284
 ; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:292
 ; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:300
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:324
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:316
 ; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -207031,11 +207184,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:328
-; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v39, 8, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
@@ -207043,100 +207196,108 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v1
-; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:108
 ; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:116
 ; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:140
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:156
 ; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:164
-; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:172
 ; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:180
 ; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:228
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:236
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:220
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:260
-; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:268
+; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:276
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:268
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:292
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:300
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:324
+; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:196
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(22)
+; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    s_waitcnt vmcnt(25)
 ; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(24)
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(28)
+; GFX9-NEXT:    s_waitcnt vmcnt(32)
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(33)
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(31)
+; GFX9-NEXT:    s_waitcnt vmcnt(35)
 ; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(36)
 ; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
 ; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(35)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(35)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
@@ -207424,6 +207585,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch .LBB97_3
 ; GFX9-NEXT:  .LBB97_2:
+; GFX9-NEXT:    s_waitcnt vmcnt(55)
 ; GFX9-NEXT:    v_mov_b32_e32 v58, v50
 ; GFX9-NEXT:    v_mov_b32_e32 v45, v59
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
@@ -209549,6 +209711,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12
@@ -209560,7 +209723,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:108
 ; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:124
 ; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:132
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
@@ -209593,25 +209755,25 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; kill: killed $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:120
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v13
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:120
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:112
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:116
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:104
@@ -209621,29 +209783,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:88
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:80
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v56
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:72
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
@@ -209789,10 +209951,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr2
@@ -209811,11 +209973,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:36
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16
 ; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr2
@@ -209886,26 +210048,26 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v60
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v59
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v63
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v60
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v59
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v63
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v61
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v61
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -212806,8 +212968,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr50
@@ -212932,7 +213094,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; kill: killed $vgpr33
@@ -213051,6 +213213,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v2
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(45)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
@@ -213188,6 +213351,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[13:14]
+; GFX9-NEXT:    s_waitcnt vmcnt(19)
 ; GFX9-NEXT:    v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
@@ -220152,8 +220316,8 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:80
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:92
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:88
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:92
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -220223,8 +220387,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v44
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v45
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v46
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v47
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v1
@@ -227723,8 +227888,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:80
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:92
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:88
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:92
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -227907,8 +228072,9 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v44
 ; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v45, v45
-; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v46, v46
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr4
@@ -230712,8 +230878,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:80
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:92
 ; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:88
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:92
 ; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v3
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
@@ -230807,7 +230973,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v51
 ; SI-NEXT:    v_mul_f32_e32 v6, 1.0, v41
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v42
-; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v45
 ; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v61
 ; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; kill: killed $vgpr61
@@ -230858,6 +231023,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; kill: killed $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v45
 ; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; kill: killed $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr61
@@ -240177,8 +240343,8 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:80
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:92
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:88
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:92
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v9
@@ -240245,8 +240411,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v49, v32
 ; SI-NEXT:    v_cvt_f16_f32_e32 v51, v60
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v61
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:96
@@ -240256,11 +240421,11 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:120
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, v61
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, v33
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, v63
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v45, v39
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v6
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 01e397d629ea9..64400ac3fff6e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6473,8 +6473,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
@@ -6504,8 +6504,9 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB26_3
@@ -13668,8 +13669,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
@@ -13699,8 +13700,9 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB50_3
@@ -20351,8 +20353,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
@@ -20382,8 +20384,9 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB70_3
@@ -26536,8 +26539,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
@@ -26567,8 +26570,9 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB86_3
@@ -32406,8 +32410,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
@@ -32435,8 +32439,9 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB98_3
@@ -37671,8 +37676,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
@@ -37700,8 +37705,9 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB106_3
@@ -41974,8 +41980,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
@@ -42003,8 +42009,9 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB110_3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 9041f64cb17fb..b2b1cdb1f58ca 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -4278,14 +4278,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v33, v4
 ; SI-NEXT:    v_mov_b32_e32 v32, v2
 ; SI-NEXT:    v_mov_b32_e32 v31, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:4
@@ -4305,17 +4305,17 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v29
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v0
-; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v0
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v8
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v44
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v6
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v8
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4572,13 +4572,13 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_mov_b32_e32 v33, v4
 ; VI-NEXT:    v_mov_b32_e32 v32, v2
 ; VI-NEXT:    v_mov_b32_e32 v31, v0
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:4
@@ -4598,17 +4598,17 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
 ; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v29
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
+; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v8
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v44
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v13, 8, v6
 ; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v44
+; VI-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4784,13 +4784,13 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v32, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:4
@@ -4810,17 +4810,17 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v29
 ; GFX9-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v44
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 8, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v44
+; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -4987,12 +4987,12 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:24
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
@@ -5027,14 +5027,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v33.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v33.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v34.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v34.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v35.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v36
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB14_3
@@ -11180,14 +11181,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v33, v4
 ; SI-NEXT:    v_mov_b32_e32 v32, v2
 ; SI-NEXT:    v_mov_b32_e32 v31, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:4
@@ -11207,17 +11208,17 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v29
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v0
-; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v0
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v8
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v44
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v6
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v8
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -11474,13 +11475,13 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_mov_b32_e32 v33, v4
 ; VI-NEXT:    v_mov_b32_e32 v32, v2
 ; VI-NEXT:    v_mov_b32_e32 v31, v0
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:4
@@ -11500,17 +11501,17 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
 ; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v29
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
+; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v8
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v44
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v13, 8, v6
 ; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v44
+; VI-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -11686,13 +11687,13 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v32, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:4
@@ -11712,17 +11713,17 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v29
 ; GFX9-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v44
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 8, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v44
+; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -11889,12 +11890,12 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:24
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
@@ -11929,14 +11930,15 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v33.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v33.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v34.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v34.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v35.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v36
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB34_3
@@ -17628,15 +17630,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v31, v14
 ; SI-NEXT:    v_mov_b32_e32 v33, v12
 ; SI-NEXT:    v_mov_b32_e32 v38, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:4
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 24, v7
@@ -17669,24 +17671,24 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    s_waitcnt vmcnt(9) expcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 24, v10
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; SI-NEXT:    s_waitcnt vmcnt(7) expcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(8) expcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 24, v10
+; SI-NEXT:    s_waitcnt vmcnt(7) expcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 8, v16
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v14
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 24, v18
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 8, v32
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 8, v16
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 24, v18
 ; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 8, v32
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB50_2
@@ -17740,6 +17742,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v14, v12, v15
 ; SI-NEXT:    v_and_b32_e32 v12, 0xff, v26
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v16, 0xff, v42
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v55
 ; SI-NEXT:    v_or_b32_e32 v34, v0, v7
@@ -17754,6 +17757,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
 ; SI-NEXT:    v_or_b32_e32 v18, v16, v19
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v16, 0xff, v43
 ; SI-NEXT:    v_or_b32_e32 v12, v0, v12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -17978,13 +17982,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_mov_b32_e32 v38, v4
 ; VI-NEXT:    v_mov_b32_e32 v32, v2
 ; VI-NEXT:    v_mov_b32_e32 v36, v0
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:4
@@ -18006,17 +18010,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_lshlrev_b16_e32 v27, 8, v27
 ; VI-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b16_e32 v43, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    v_lshlrev_b16_e32 v43, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b16_e32 v47, 8, v4
+; VI-NEXT:    v_lshlrev_b16_e32 v44, 8, v8
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_lshlrev_b16_e32 v45, 8, v10
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v46, 8, v6
 ; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b16_e32 v44, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v45, 8, v10
+; VI-NEXT:    v_lshlrev_b16_e32 v47, 8, v4
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -18097,6 +18101,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB50_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u16_e32 v0, 3, v55
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x300
@@ -18201,13 +18206,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v35, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v36, v0
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:4
@@ -18229,17 +18234,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
 ; GFX9-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v27, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v27, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v46, 8, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v45, 8, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v44, 8, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v47, 8, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v45, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v44, 8, v10
+; GFX9-NEXT:    v_lshlrev_b16_e32 v46, 8, v4
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -18411,17 +18416,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:16
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:24
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:28
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v25.l
@@ -18463,8 +18468,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v34.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v37
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB50_3
@@ -23933,13 +23939,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v36, v4
 ; SI-NEXT:    v_mov_b32_e32 v31, v2
 ; SI-NEXT:    v_mov_b32_e32 v35, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
@@ -23974,21 +23980,21 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    s_waitcnt vmcnt(9) expcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 8, v0
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    s_waitcnt vmcnt(7) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(8) expcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 8, v0
+; SI-NEXT:    s_waitcnt vmcnt(7) expcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 8, v33
+; SI-NEXT:    s_waitcnt vmcnt(6) expcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 8, v34
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 8, v32
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 8, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 8, v32
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 8, v33
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 8, v34
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr34
@@ -24101,6 +24107,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB62_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v52
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v59, v0
@@ -24246,13 +24253,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_mov_b32_e32 v38, v4
 ; VI-NEXT:    v_mov_b32_e32 v32, v2
 ; VI-NEXT:    v_mov_b32_e32 v36, v0
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:4
@@ -24274,17 +24281,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_lshlrev_b16_e32 v27, 8, v27
 ; VI-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b16_e32 v43, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    v_lshlrev_b16_e32 v43, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b16_e32 v47, 8, v4
+; VI-NEXT:    v_lshlrev_b16_e32 v44, 8, v8
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_lshlrev_b16_e32 v45, 8, v10
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v46, 8, v6
 ; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b16_e32 v44, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v45, 8, v10
+; VI-NEXT:    v_lshlrev_b16_e32 v47, 8, v4
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -24365,6 +24372,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB62_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u16_e32 v0, 3, v55
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x300
@@ -24469,13 +24477,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v35, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v36, v0
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:4
@@ -24497,17 +24505,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
 ; GFX9-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v27, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v27, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v46, 8, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v45, 8, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v44, 8, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v47, 8, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v45, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v44, 8, v10
+; GFX9-NEXT:    v_lshlrev_b16_e32 v46, 8, v4
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -24679,17 +24687,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:16
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:24
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:28
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v25.l
@@ -24731,8 +24739,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v34.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v37
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB62_3
@@ -28267,13 +28276,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v33, v4
 ; SI-NEXT:    v_mov_b32_e32 v32, v2
 ; SI-NEXT:    v_mov_b32_e32 v31, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
@@ -28296,17 +28305,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 8, v29
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v0
-; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v0
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v8
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v10
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -28572,13 +28581,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_mov_b32_e32 v33, v4
 ; VI-NEXT:    v_mov_b32_e32 v32, v2
 ; VI-NEXT:    v_mov_b32_e32 v31, v0
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:4
@@ -28600,17 +28609,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_lshlrev_b16_e32 v51, 8, v27
 ; VI-NEXT:    v_lshlrev_b16_e32 v27, 8, v29
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b16_e32 v25, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    v_lshlrev_b16_e32 v25, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v4
+; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v8
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v10
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v6
 ; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v10
+; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v4
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -28795,13 +28804,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v32, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:4
@@ -28823,17 +28832,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v51, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v27, 8, v29
 ; GFX9-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v25, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v25, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v10
+; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v4
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -29004,17 +29013,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:16
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:24
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v25.l
@@ -29051,6 +29060,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v37.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v37.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v49
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB72_3
@@ -32316,13 +32326,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v33, v4
 ; SI-NEXT:    v_mov_b32_e32 v32, v2
 ; SI-NEXT:    v_mov_b32_e32 v31, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
@@ -32345,17 +32355,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 8, v29
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v0
-; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v0
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v8
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v10
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -32621,13 +32631,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_mov_b32_e32 v33, v4
 ; VI-NEXT:    v_mov_b32_e32 v32, v2
 ; VI-NEXT:    v_mov_b32_e32 v31, v0
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:4
@@ -32649,17 +32659,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_lshlrev_b16_e32 v51, 8, v27
 ; VI-NEXT:    v_lshlrev_b16_e32 v27, 8, v29
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b16_e32 v25, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    v_lshlrev_b16_e32 v25, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v4
+; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v8
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v10
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v6
 ; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v10
+; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v4
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -32844,13 +32854,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v32, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:4
@@ -32872,17 +32882,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v51, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v27, 8, v29
 ; GFX9-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v25, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v25, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v10
+; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v4
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -33053,17 +33063,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:16
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:24
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v25.l
@@ -33100,6 +33110,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v37.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v37.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v49
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB76_3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index ee23420c2a662..ba195133dd5d1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -11426,17 +11426,17 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v44
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v58
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v47
@@ -12838,18 +12838,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
@@ -12861,6 +12849,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
@@ -12873,7 +12873,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v20.l
@@ -12904,12 +12904,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v80.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v67.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v68.l
@@ -12921,6 +12915,16 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v81
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB26_3
@@ -13204,17 +13208,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:8
@@ -13227,6 +13220,17 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:72
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:84
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:76
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v54, off, s32 offset:68
@@ -13255,40 +13259,41 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v97, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v65
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v66
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v67
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v83
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v131
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -13711,22 +13716,22 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 8, v3
@@ -13748,21 +13753,19 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v52
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v12
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 8, v14
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v28
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 8, v26
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v24
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -13817,33 +13820,41 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v1, v25, v1
 ; SI-NEXT:    v_or_b32_e32 v10, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v30
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v42
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v11, v1
 ; SI-NEXT:    v_or_b32_e32 v11, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v41
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v40
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v19, v1
 ; SI-NEXT:    v_or_b32_e32 v12, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v35
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v55
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v13, v1
 ; SI-NEXT:    v_or_b32_e32 v13, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v53
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v14, v1
 ; SI-NEXT:    v_or_b32_e32 v14, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v37
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v43
@@ -14119,6 +14130,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:  .LBB27_4:
 ; SI-NEXT:    v_mov_b32_e32 v27, v44
 ; SI-NEXT:    v_mov_b32_e32 v26, v8
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_mov_b32_e32 v52, v42
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; SI-NEXT:    s_branch .LBB27_2
@@ -14155,22 +14167,22 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:68
 ; VI-NEXT:    v_mov_b32_e32 v37, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 8, v1
@@ -14192,21 +14204,19 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v21, 8, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 8, v33
-; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
-; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v33, 8, v10
-; VI-NEXT:    s_waitcnt vmcnt(9)
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 8, v12
-; VI-NEXT:    s_waitcnt vmcnt(7)
+; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v27, 8, v28
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v26
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_lshlrev_b32_e32 v15, 8, v24
 ; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -14235,19 +14245,27 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -14461,6 +14479,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:  .LBB27_4:
 ; VI-NEXT:    v_mov_b32_e32 v28, v44
 ; VI-NEXT:    v_mov_b32_e32 v26, v4
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_mov_b32_e32 v33, v42
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_branch .LBB27_2
@@ -14497,22 +14516,22 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    v_mov_b32_e32 v37, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 8, v1
@@ -14536,22 +14555,22 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 8, v33
-; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(13)
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 8, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 8, v12
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 8, v28
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 8, v26
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 8, v24
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -14580,19 +14599,27 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_and_b32 s4, s28, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s29, 8
 ; GFX9-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -14807,6 +14834,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:  .LBB27_4:
 ; GFX9-NEXT:    v_mov_b32_e32 v28, v44
 ; GFX9-NEXT:    v_mov_b32_e32 v26, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v42
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB27_2
@@ -14819,8 +14847,6 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
@@ -14829,6 +14855,8 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:44
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:28
@@ -14852,23 +14880,23 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB27_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -15204,8 +15232,6 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
@@ -15214,6 +15240,8 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:44
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:36
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:28
@@ -15237,23 +15265,23 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB27_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -26438,17 +26466,17 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v44
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v58
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v47
@@ -27850,18 +27878,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
@@ -27873,6 +27889,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
@@ -27885,7 +27913,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v20.l
@@ -27916,12 +27944,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v80.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v67.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v68.l
@@ -27933,6 +27955,16 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v81
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB50_3
@@ -28216,17 +28248,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:8
@@ -28239,6 +28260,17 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:72
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:84
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:76
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v54, off, s32 offset:68
@@ -28267,40 +28299,41 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v97, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v65
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v66
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v67
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v83
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v131
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -28723,22 +28756,22 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 8, v3
@@ -28760,21 +28793,19 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v52
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v12
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 8, v14
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v28
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 8, v26
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v24
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -28829,33 +28860,41 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v1, v25, v1
 ; SI-NEXT:    v_or_b32_e32 v10, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v30
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v42
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v11, v1
 ; SI-NEXT:    v_or_b32_e32 v11, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v41
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v40
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v19, v1
 ; SI-NEXT:    v_or_b32_e32 v12, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v35
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v55
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v13, v1
 ; SI-NEXT:    v_or_b32_e32 v13, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v53
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v14, v1
 ; SI-NEXT:    v_or_b32_e32 v14, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v37
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v43
@@ -29131,6 +29170,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; SI-NEXT:  .LBB51_4:
 ; SI-NEXT:    v_mov_b32_e32 v27, v44
 ; SI-NEXT:    v_mov_b32_e32 v26, v8
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_mov_b32_e32 v52, v42
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; SI-NEXT:    s_branch .LBB51_2
@@ -29167,22 +29207,22 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:68
 ; VI-NEXT:    v_mov_b32_e32 v37, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 8, v1
@@ -29204,21 +29244,19 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v21, 8, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 8, v33
-; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
-; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v33, 8, v10
-; VI-NEXT:    s_waitcnt vmcnt(9)
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 8, v12
-; VI-NEXT:    s_waitcnt vmcnt(7)
+; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v27, 8, v28
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v26
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_lshlrev_b32_e32 v15, 8, v24
 ; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -29247,19 +29285,27 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -29473,6 +29519,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; VI-NEXT:  .LBB51_4:
 ; VI-NEXT:    v_mov_b32_e32 v28, v44
 ; VI-NEXT:    v_mov_b32_e32 v26, v4
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_mov_b32_e32 v33, v42
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_branch .LBB51_2
@@ -29509,22 +29556,22 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    v_mov_b32_e32 v37, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 8, v1
@@ -29548,22 +29595,22 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 8, v33
-; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(13)
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 8, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 8, v12
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 8, v28
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 8, v26
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 8, v24
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -29592,19 +29639,27 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_and_b32 s4, s28, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s29, 8
 ; GFX9-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -29819,6 +29874,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX9-NEXT:  .LBB51_4:
 ; GFX9-NEXT:    v_mov_b32_e32 v28, v44
 ; GFX9-NEXT:    v_mov_b32_e32 v26, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v42
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB51_2
@@ -29831,8 +29887,6 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
@@ -29841,6 +29895,8 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:44
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:28
@@ -29864,23 +29920,23 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB51_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -30216,8 +30272,6 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
@@ -30226,6 +30280,8 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:44
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:36
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:28
@@ -30249,23 +30305,23 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB51_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -40726,17 +40782,17 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v44
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v58
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v47
@@ -42138,18 +42194,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
@@ -42161,6 +42205,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
@@ -42173,7 +42229,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v20.l
@@ -42204,12 +42260,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v80.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v67.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v68.l
@@ -42221,6 +42271,16 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v81
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB70_3
@@ -42504,17 +42564,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:8
@@ -42527,6 +42576,17 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:72
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:84
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:76
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v54, off, s32 offset:68
@@ -42555,40 +42615,41 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v97, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v65
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v66
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v67
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v83
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v131
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -43011,22 +43072,22 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 8, v3
@@ -43048,21 +43109,19 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v52
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v12
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 8, v14
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v28
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 8, v26
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v24
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -43117,33 +43176,41 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; SI-NEXT:    v_or_b32_e32 v1, v25, v1
 ; SI-NEXT:    v_or_b32_e32 v10, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v30
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v42
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v11, v1
 ; SI-NEXT:    v_or_b32_e32 v11, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v41
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v40
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v19, v1
 ; SI-NEXT:    v_or_b32_e32 v12, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v35
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v55
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v13, v1
 ; SI-NEXT:    v_or_b32_e32 v13, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v53
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v14, v1
 ; SI-NEXT:    v_or_b32_e32 v14, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v37
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v43
@@ -43419,6 +43486,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; SI-NEXT:  .LBB71_4:
 ; SI-NEXT:    v_mov_b32_e32 v27, v44
 ; SI-NEXT:    v_mov_b32_e32 v26, v8
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_mov_b32_e32 v52, v42
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; SI-NEXT:    s_branch .LBB71_2
@@ -43455,22 +43523,22 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:68
 ; VI-NEXT:    v_mov_b32_e32 v37, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 8, v1
@@ -43492,21 +43560,19 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v21, 8, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 8, v33
-; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
-; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v33, 8, v10
-; VI-NEXT:    s_waitcnt vmcnt(9)
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 8, v12
-; VI-NEXT:    s_waitcnt vmcnt(7)
+; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v27, 8, v28
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v26
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_lshlrev_b32_e32 v15, 8, v24
 ; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -43535,19 +43601,27 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; VI-NEXT:    v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -43761,6 +43835,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; VI-NEXT:  .LBB71_4:
 ; VI-NEXT:    v_mov_b32_e32 v28, v44
 ; VI-NEXT:    v_mov_b32_e32 v26, v4
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_mov_b32_e32 v33, v42
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_branch .LBB71_2
@@ -43797,22 +43872,22 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    v_mov_b32_e32 v37, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 8, v1
@@ -43836,22 +43911,22 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 8, v33
-; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(13)
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 8, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 8, v12
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 8, v28
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 8, v26
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 8, v24
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -43880,19 +43955,27 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_and_b32 s4, s28, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s29, 8
 ; GFX9-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -44107,6 +44190,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX9-NEXT:  .LBB71_4:
 ; GFX9-NEXT:    v_mov_b32_e32 v28, v44
 ; GFX9-NEXT:    v_mov_b32_e32 v26, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v42
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB71_2
@@ -44119,8 +44203,6 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
@@ -44129,6 +44211,8 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:44
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:28
@@ -44152,23 +44236,23 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB71_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -44504,8 +44588,6 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
@@ -44514,6 +44596,8 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:44
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:36
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:28
@@ -44537,23 +44621,23 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB71_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -54174,17 +54258,17 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v44
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v58
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v47
@@ -55586,18 +55670,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
@@ -55609,6 +55681,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
@@ -55621,7 +55705,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v20.l
@@ -55652,12 +55736,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v80.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v67.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v68.l
@@ -55669,6 +55747,16 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v81
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB86_3
@@ -55952,17 +56040,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:8
@@ -55975,6 +56052,17 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:72
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:84
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:76
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v54, off, s32 offset:68
@@ -56003,40 +56091,41 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v97, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v65
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v66
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v67
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v83
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v131
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -56459,22 +56548,22 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 8, v3
@@ -56496,21 +56585,19 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v6
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v52
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v12
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 8, v14
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v28
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 8, v26
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v24
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -56565,33 +56652,41 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v1, v25, v1
 ; SI-NEXT:    v_or_b32_e32 v10, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v30
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v42
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v11, v1
 ; SI-NEXT:    v_or_b32_e32 v11, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v41
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v40
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v19, v1
 ; SI-NEXT:    v_or_b32_e32 v12, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v35
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v55
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v13, v1
 ; SI-NEXT:    v_or_b32_e32 v13, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v53
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v14, v1
 ; SI-NEXT:    v_or_b32_e32 v14, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v37
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v43
@@ -56867,6 +56962,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:  .LBB87_4:
 ; SI-NEXT:    v_mov_b32_e32 v27, v44
 ; SI-NEXT:    v_mov_b32_e32 v26, v8
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_mov_b32_e32 v52, v42
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; SI-NEXT:    s_branch .LBB87_2
@@ -56903,22 +56999,22 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:68
 ; VI-NEXT:    v_mov_b32_e32 v37, v30
 ; VI-NEXT:    v_lshlrev_b32_e32 v30, 8, v1
@@ -56940,21 +57036,19 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v21, 8, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 8, v33
-; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
-; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v8
-; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v33, 8, v10
-; VI-NEXT:    s_waitcnt vmcnt(9)
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 8, v12
-; VI-NEXT:    s_waitcnt vmcnt(7)
+; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v27, 8, v28
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v26
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_lshlrev_b32_e32 v15, 8, v24
 ; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -56983,19 +57077,27 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -57209,6 +57311,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:  .LBB87_4:
 ; VI-NEXT:    v_mov_b32_e32 v28, v44
 ; VI-NEXT:    v_mov_b32_e32 v26, v4
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_mov_b32_e32 v33, v42
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_branch .LBB87_2
@@ -57245,22 +57348,22 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    v_mov_b32_e32 v37, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 8, v1
@@ -57284,22 +57387,22 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 8, v33
-; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 8, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(13)
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 8, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 8, v12
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 8, v28
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 8, v26
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 8, v24
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -57328,19 +57431,27 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_and_b32 s4, s28, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s29, 8
 ; GFX9-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -57555,6 +57666,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:  .LBB87_4:
 ; GFX9-NEXT:    v_mov_b32_e32 v28, v44
 ; GFX9-NEXT:    v_mov_b32_e32 v26, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v42
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB87_2
@@ -57567,8 +57679,6 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
@@ -57577,6 +57687,8 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:44
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:28
@@ -57600,23 +57712,23 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB87_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -57952,8 +58064,6 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
@@ -57962,6 +58072,8 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:44
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:36
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:28
@@ -57985,23 +58097,23 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB87_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -67744,6 +67856,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32
@@ -67767,7 +67881,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
@@ -67784,6 +67897,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124
 ; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v10
@@ -67792,21 +67908,21 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 8, v26
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v31
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v32
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 8, v33
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v34
-; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v35
-; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v60, 8, v36
-; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 24, v37
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 24, v38
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v25
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v39
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 8, v48
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr48
@@ -67819,10 +67935,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -69388,18 +69500,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
@@ -69411,6 +69511,18 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
@@ -69426,7 +69538,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
@@ -69459,27 +69571,30 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB98_3
@@ -69764,17 +69879,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
@@ -69787,6 +69891,17 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:72
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:84
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:76
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:68
@@ -69815,37 +69930,41 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v96
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v100
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v101
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v102
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v103
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v113
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v117
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -70007,10 +70126,13 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB98_2
 ; GFX11-FAKE16-NEXT:  .LBB98_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v70, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v69, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v66, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v64, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -70200,24 +70322,24 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v46, v30
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:68
 ; SI-NEXT:    v_readfirstlane_b32 s43, v1
 ; SI-NEXT:    v_readfirstlane_b32 s42, v0
@@ -70239,25 +70361,26 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v35
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v38
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v36
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 8, v48
-; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v39
-; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 8, v37
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v49
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v30
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 24, v31
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v38, 8, v33
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 24, v34
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
@@ -70278,6 +70401,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v40
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(10) expcnt(0)
@@ -70773,6 +70897,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB99_4:
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_mov_b32_e32 v39, v32
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; implicit-def: $sgpr6
@@ -70841,22 +70966,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:68
 ; VI-NEXT:    v_lshlrev_b32_e32 v50, 8, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
@@ -70881,19 +71006,15 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v46, 8, v28
 ; VI-NEXT:    v_lshlrev_b32_e32 v56, 8, v4
-; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v58, 8, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v59, 8, v8
 ; VI-NEXT:    v_lshlrev_b32_e32 v60, 8, v10
-; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v61, 8, v12
-; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v62, 8, v14
-; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_lshlrev_b32_e32 v63, 8, v22
-; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v33, 8, v33
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v24
 ; VI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -70916,19 +71037,27 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -70975,6 +71104,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    s_cbranch_execnz .LBB99_3
 ; VI-NEXT:  .LBB99_2: ; %cmp.true
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v44
 ; VI-NEXT:    v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x300, v3
@@ -71181,22 +71311,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v50, 8, v3
@@ -71222,22 +71352,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v47, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 8, v4
-; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-NEXT:    s_waitcnt vmcnt(19)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v58, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(12)
+; GFX9-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v60, 8, v12
-; GFX9-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v63, 8, v14
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v62, 8, v36
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v38, 8, v38
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v36, 8, v31
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB99_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
@@ -71278,38 +71408,46 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    s_and_b32 s4, s16, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s17, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_and_b32 s5, s18, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s19, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_and_b32 s5, s20, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s21, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    s_and_b32 s6, s22, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s23, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
 ; GFX9-NEXT:    s_and_b32 s6, s24, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s25, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_and_b32 s7, s26, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s27, 8
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -71324,6 +71462,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    s_cbranch_execnz .LBB99_3
 ; GFX9-NEXT:  .LBB99_2: ; %cmp.true
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_add_u32_e32 v3, 3, v45
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v14, 0x300, v3
@@ -71410,6 +71549,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    s_and_b32 s9, s16, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s10, s17, 8
 ; GFX9-NEXT:    s_add_i32 s18, s18, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v57
 ; GFX9-NEXT:    v_add_u32_e32 v2, 3, v46
 ; GFX9-NEXT:    s_or_b32 s9, s10, s9
@@ -71505,8 +71645,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
 ; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
@@ -71515,6 +71653,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:44
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:28
@@ -71538,23 +71678,23 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB99_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -71697,6 +71837,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v64
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
@@ -71831,8 +71972,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
 ; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
@@ -71841,6 +71980,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:44
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:28
@@ -71864,23 +72005,23 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB99_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -81810,18 +81951,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
@@ -81833,6 +81962,18 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
@@ -81848,7 +81989,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
@@ -81881,27 +82022,30 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB106_3
@@ -82186,17 +82330,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
@@ -82209,6 +82342,17 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:72
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:84
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:76
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:68
@@ -82237,37 +82381,41 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v96
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v100
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v101
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v102
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v103
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v113
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v117
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -82429,10 +82577,13 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB106_2
 ; GFX11-FAKE16-NEXT:  .LBB106_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v70, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v69, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v66, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v64, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -83114,22 +83265,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:68
 ; VI-NEXT:    v_lshlrev_b32_e32 v50, 8, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
@@ -83154,19 +83305,15 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v46, 8, v28
 ; VI-NEXT:    v_lshlrev_b32_e32 v56, 8, v4
-; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v58, 8, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v59, 8, v8
 ; VI-NEXT:    v_lshlrev_b32_e32 v60, 8, v10
-; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v61, 8, v12
-; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v62, 8, v14
-; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_lshlrev_b32_e32 v63, 8, v22
-; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v33, 8, v33
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v24
 ; VI-NEXT:    s_cbranch_scc0 .LBB107_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -83189,19 +83336,27 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -83248,6 +83403,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    s_cbranch_execnz .LBB107_3
 ; VI-NEXT:  .LBB107_2: ; %cmp.true
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v44
 ; VI-NEXT:    v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x300, v3
@@ -83454,22 +83610,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v50, 8, v3
@@ -83495,22 +83651,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v47, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 8, v4
-; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-NEXT:    s_waitcnt vmcnt(19)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v58, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(12)
+; GFX9-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v60, 8, v12
-; GFX9-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v63, 8, v14
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v62, 8, v36
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v38, 8, v38
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v36, 8, v31
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB107_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
@@ -83551,38 +83707,46 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    s_and_b32 s4, s16, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s17, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_and_b32 s5, s18, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s19, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_and_b32 s5, s20, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s21, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    s_and_b32 s6, s22, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s23, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
 ; GFX9-NEXT:    s_and_b32 s6, s24, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s25, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_and_b32 s7, s26, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s27, 8
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -83597,6 +83761,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    s_cbranch_execnz .LBB107_3
 ; GFX9-NEXT:  .LBB107_2: ; %cmp.true
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_add_u32_e32 v3, 3, v45
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v14, 0x300, v3
@@ -83683,6 +83848,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    s_and_b32 s9, s16, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s10, s17, 8
 ; GFX9-NEXT:    s_add_i32 s18, s18, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v57
 ; GFX9-NEXT:    v_add_u32_e32 v2, 3, v46
 ; GFX9-NEXT:    s_or_b32 s9, s10, s9
@@ -83778,8 +83944,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
 ; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
@@ -83788,6 +83952,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:44
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:28
@@ -83811,23 +83977,23 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB107_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -83970,6 +84136,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v64
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
@@ -84104,8 +84271,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
 ; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
@@ -84114,6 +84279,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:44
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:28
@@ -84137,23 +84304,23 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB107_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -90408,6 +90575,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:132
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32
@@ -90428,7 +90597,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:100
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:92
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28
 ; SI-NEXT:    v_lshlrev_b32_e32 v63, 8, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v27
@@ -90452,39 +90620,37 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:20
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 24, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v12
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v17
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v20
-; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v24
-; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v28
-; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 8, v31
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 24, v32
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 24, v33
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 8, v34
-; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 24, v36
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:20
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -92045,18 +92211,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
@@ -92068,6 +92222,18 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
@@ -92083,7 +92249,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
@@ -92116,27 +92282,30 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB110_3
@@ -92421,17 +92590,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
 ; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
@@ -92444,6 +92602,17 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:64
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:72
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:84
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:76
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:68
@@ -92472,37 +92641,41 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v29
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v96
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v100
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v101
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v102
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v103
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v113
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v117
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v10
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -92664,10 +92837,13 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB110_2
 ; GFX11-FAKE16-NEXT:  .LBB110_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v70, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v69, 3
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v66, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v64, 3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -92849,21 +93025,20 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:40
-; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:36
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
 ; SI-NEXT:    v_readfirstlane_b32 s46, v30
 ; SI-NEXT:    v_readfirstlane_b32 s44, v23
@@ -92890,14 +93065,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v36
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 24, v37
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 24, v38
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 24, v39
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 24, v38
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 24, v48
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 24, v37
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 24, v45
 ; SI-NEXT:    s_cbranch_scc0 .LBB111_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -92945,6 +93120,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v18
 ; SI-NEXT:    s_and_b32 s4, s45, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s44, 8
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_and_b32_e32 v25, 0xff, v52
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v3
@@ -92961,6 +93137,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    s_and_b32 s4, s46, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s47, 8
 ; SI-NEXT:    v_or_b32_e32 v32, v29, v25
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v29, 0xff, v40
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
@@ -93353,22 +93530,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:44
 ; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:68
 ; VI-NEXT:    v_lshlrev_b32_e32 v50, 8, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
@@ -93393,19 +93570,15 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v46, 8, v28
 ; VI-NEXT:    v_lshlrev_b32_e32 v56, 8, v4
-; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v58, 8, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v59, 8, v8
 ; VI-NEXT:    v_lshlrev_b32_e32 v60, 8, v10
-; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v61, 8, v12
-; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v62, 8, v14
-; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_lshlrev_b32_e32 v63, 8, v22
-; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v33, 8, v33
-; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v24
 ; VI-NEXT:    s_cbranch_scc0 .LBB111_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
@@ -93428,19 +93601,27 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(12)
 ; VI-NEXT:    v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(10)
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(9)
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -93487,6 +93668,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    s_cbranch_execnz .LBB111_3
 ; VI-NEXT:  .LBB111_2: ; %cmp.true
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(8)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v44
 ; VI-NEXT:    v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x300, v3
@@ -93693,22 +93875,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    buffer_load_ushort v14, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v50, 8, v3
@@ -93734,22 +93916,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v47, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 8, v4
-; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-NEXT:    s_waitcnt vmcnt(19)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v58, 8, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(12)
+; GFX9-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v60, 8, v12
-; GFX9-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v63, 8, v14
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v62, 8, v36
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v38, 8, v38
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v36, 8, v31
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB111_4
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
@@ -93790,38 +93972,46 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX9-NEXT:    s_and_b32 s4, s16, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s17, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
 ; GFX9-NEXT:    s_and_b32 s5, s18, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s19, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_and_b32 s5, s20, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s21, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    s_and_b32 s6, s22, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s23, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    v_lshl_or_b32 v13, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
 ; GFX9-NEXT:    s_and_b32 s6, s24, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s25, 8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_and_b32 s7, s26, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s27, 8
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v1, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -93836,6 +94026,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    s_cbranch_execnz .LBB111_3
 ; GFX9-NEXT:  .LBB111_2: ; %cmp.true
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_add_u32_e32 v3, 3, v45
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v14, 0x300, v3
@@ -93922,6 +94113,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX9-NEXT:    s_and_b32 s9, s16, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s10, s17, 8
 ; GFX9-NEXT:    s_add_i32 s18, s18, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v57
 ; GFX9-NEXT:    v_add_u32_e32 v2, 3, v46
 ; GFX9-NEXT:    s_or_b32 s9, s10, s9
@@ -94017,8 +94209,6 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
 ; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
@@ -94027,6 +94217,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:44
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:36
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:28
@@ -94050,23 +94242,23 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB111_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -94209,6 +94401,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v64
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
@@ -94343,8 +94536,6 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
 ; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
@@ -94353,6 +94544,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:44
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
 ; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:28
@@ -94376,23 +94569,23 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB111_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 5d4df4bde1af8..f0c32e8b2b7a3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -3418,10 +3418,10 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v36, v22
@@ -3449,13 +3449,13 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -10551,10 +10551,10 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v36, v22
@@ -10582,13 +10582,13 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -16900,10 +16900,10 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v36, v22
@@ -16931,13 +16931,13 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -22476,10 +22476,10 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v36, v22
@@ -22507,13 +22507,13 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v4
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -26814,12 +26814,12 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32
 ; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; kill: killed $vgpr48
@@ -26865,7 +26865,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; kill: killed $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 44cfd6c28ca6a..fd392b702568e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -3566,10 +3566,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    v_mov_b32_e32 v50, v10
 ; SI-NEXT:    v_mov_b32_e32 v51, v8
@@ -3595,28 +3595,28 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v29
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v0
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:12
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v10
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -11765,10 +11765,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    v_mov_b32_e32 v50, v10
 ; SI-NEXT:    v_mov_b32_e32 v51, v8
@@ -11794,28 +11794,28 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v29
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v0
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:12
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v10
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -19274,10 +19274,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    v_mov_b32_e32 v50, v10
 ; SI-NEXT:    v_mov_b32_e32 v51, v8
@@ -19303,28 +19303,28 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v29
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v0
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:12
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v10
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -26013,10 +26013,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28
 ; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    v_mov_b32_e32 v50, v10
 ; SI-NEXT:    v_mov_b32_e32 v51, v8
@@ -26042,28 +26042,28 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v29
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v0
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:12
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v10
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -31405,9 +31405,9 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:28
@@ -31472,7 +31472,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; kill: killed $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -31618,6 +31618,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v26
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_add_i32_e32 v38, vcc, 3, v38
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 87d5157b3c340..01625cd53ef68 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -3816,8 +3816,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v50, v10
 ; SI-NEXT:    v_mov_b32_e32 v51, v8
@@ -3843,44 +3843,44 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:28
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -12779,8 +12779,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v50, v10
 ; SI-NEXT:    v_mov_b32_e32 v51, v8
@@ -12806,44 +12806,44 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:28
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -21028,8 +21028,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v50, v10
 ; SI-NEXT:    v_mov_b32_e32 v51, v8
@@ -21055,44 +21055,44 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:28
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -28444,8 +28444,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v50, v10
 ; SI-NEXT:    v_mov_b32_e32 v51, v8
@@ -28471,44 +28471,44 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:28
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -34405,13 +34405,13 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
 ; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:44
@@ -34488,7 +34488,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; kill: killed $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -34668,6 +34668,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v24
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_add_i32_e32 v50, vcc, 3, v50
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index fb2e94fc3b87a..2dc27719b5977 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -4077,14 +4077,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v5
@@ -4101,44 +4101,45 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -13914,14 +13915,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v5
@@ -13938,44 +13939,45 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -23014,14 +23016,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v5
@@ -23038,44 +23040,45 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -31231,14 +31234,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v5
@@ -31255,44 +31258,45 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -37854,6 +37858,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:68
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:60
@@ -37864,7 +37869,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:32
@@ -37888,7 +37892,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -37988,6 +37992,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v16
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v40
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
@@ -38053,7 +38058,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v30
 ; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v54
 ; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
@@ -38061,7 +38066,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v55
 ; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v56
 ; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -38167,6 +38172,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v22
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_add_i32_e32 v54, vcc, 3, v54
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 07cdbef82d892..5a978534eeb9e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -4372,12 +4372,12 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
 ; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v3
@@ -4395,61 +4395,61 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v20
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -5519,10 +5519,10 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v62, v30
@@ -5556,10 +5556,10 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
 ; SI-NEXT:    s_cbranch_scc0 .LBB15_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -5607,11 +5607,13 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s8, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v45
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v44
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v51
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v43
 ; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_and_b32 s10, s28, 0xffff
@@ -5774,7 +5776,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB15_4:
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v47, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v38
@@ -15107,12 +15109,12 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
 ; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v3
@@ -15130,61 +15132,61 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v20
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -16254,10 +16256,10 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v62, v30
@@ -16291,10 +16293,10 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
 ; SI-NEXT:    s_cbranch_scc0 .LBB31_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -16342,11 +16344,13 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    s_and_b32 s8, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v45
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v44
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v51
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v43
 ; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_and_b32 s10, s28, 0xffff
@@ -16509,7 +16513,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB31_4:
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v47, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v38
@@ -25054,12 +25058,12 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
 ; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v3
@@ -25077,61 +25081,61 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v20
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -26201,10 +26205,10 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v62, v30
@@ -26238,10 +26242,10 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
 ; SI-NEXT:    s_cbranch_scc0 .LBB43_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -26289,11 +26293,13 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s8, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v45
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v44
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v51
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v43
 ; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_and_b32 s10, s28, 0xffff
@@ -26456,7 +26462,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB43_4:
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v47, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v38
@@ -34084,12 +34090,12 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
 ; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v3
@@ -34107,61 +34113,61 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v20
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -35231,10 +35237,10 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v62, v30
@@ -35268,10 +35274,10 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -35319,11 +35325,13 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    s_and_b32 s8, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v45
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v44
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v51
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v43
 ; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_and_b32 s10, s28, 0xffff
@@ -35486,7 +35494,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB51_4:
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v47, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v38
@@ -41338,6 +41346,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:88
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:80
@@ -41353,7 +41362,6 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:88
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:44
@@ -41372,7 +41380,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v39
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; kill: killed $vgpr39
@@ -41561,7 +41569,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v30
 ; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v42
 ; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
@@ -41585,7 +41593,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
 ; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
@@ -41701,6 +41709,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v20
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_add_i32_e32 v42, vcc, 3, v42
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -42652,9 +42661,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 8eb71e90f8504..a1e41c91784b2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -4696,8 +4696,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v3
@@ -4716,70 +4716,70 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:48
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v24
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v26
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -5945,14 +5945,14 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:36
 ; SI-NEXT:    v_mov_b32_e32 v31, v26
 ; SI-NEXT:    v_mov_b32_e32 v41, v24
@@ -5987,14 +5987,13 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; SI-NEXT:    s_cbranch_scc0 .LBB15_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -6042,21 +6041,25 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s6, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s7, s21, 16
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v54
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v56
 ; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_and_b32 s7, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s8, s23, 16
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v32
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v51
 ; SI-NEXT:    s_or_b32 s7, s7, s8
 ; SI-NEXT:    s_and_b32 s8, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_or_b32_e32 v24, v0, v59
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v47
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    v_or_b32_e32 v25, v0, v53
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v46
 ; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_and_b32 s10, s28, 0xffff
@@ -6238,7 +6241,9 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v62, v58
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_mov_b32_e32 v58, v51
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_mov_b32_e32 v51, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v41
@@ -16321,8 +16326,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v3
@@ -16341,70 +16346,70 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:48
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v24
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v26
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -17570,14 +17575,14 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:36
 ; SI-NEXT:    v_mov_b32_e32 v31, v26
 ; SI-NEXT:    v_mov_b32_e32 v41, v24
@@ -17612,14 +17617,13 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; SI-NEXT:    s_cbranch_scc0 .LBB31_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -17667,21 +17671,25 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    s_and_b32 s6, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s7, s21, 16
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v54
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v56
 ; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_and_b32 s7, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s8, s23, 16
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v32
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v51
 ; SI-NEXT:    s_or_b32 s7, s7, s8
 ; SI-NEXT:    s_and_b32 s8, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_or_b32_e32 v24, v0, v59
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v47
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    v_or_b32_e32 v25, v0, v53
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v46
 ; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_and_b32 s10, s28, 0xffff
@@ -17863,7 +17871,9 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v62, v58
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_mov_b32_e32 v58, v51
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_mov_b32_e32 v51, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v41
@@ -27111,8 +27121,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v3
@@ -27131,70 +27141,70 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:48
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v24
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v26
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -28360,14 +28370,14 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:36
 ; SI-NEXT:    v_mov_b32_e32 v31, v26
 ; SI-NEXT:    v_mov_b32_e32 v41, v24
@@ -28402,14 +28412,13 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; SI-NEXT:    s_cbranch_scc0 .LBB43_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -28457,21 +28466,25 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s6, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s7, s21, 16
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v54
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v56
 ; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_and_b32 s7, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s8, s23, 16
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v32
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v51
 ; SI-NEXT:    s_or_b32 s7, s7, s8
 ; SI-NEXT:    s_and_b32 s8, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_or_b32_e32 v24, v0, v59
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v47
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    v_or_b32_e32 v25, v0, v53
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v46
 ; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_and_b32 s10, s28, 0xffff
@@ -28653,7 +28666,9 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v62, v58
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_mov_b32_e32 v58, v51
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_mov_b32_e32 v51, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v41
@@ -36929,8 +36944,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v54, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96
 ; SI-NEXT:    v_mov_b32_e32 v53, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v3
@@ -36949,70 +36964,70 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:48
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:44
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v24
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v26
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -38178,14 +38193,14 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:36
 ; SI-NEXT:    v_mov_b32_e32 v31, v26
 ; SI-NEXT:    v_mov_b32_e32 v41, v24
@@ -38220,14 +38235,13 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -38275,21 +38289,25 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    s_and_b32 s6, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s7, s21, 16
 ; SI-NEXT:    v_or_b32_e32 v22, v0, v54
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v56
 ; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_and_b32 s7, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s8, s23, 16
 ; SI-NEXT:    v_or_b32_e32 v23, v0, v32
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v51
 ; SI-NEXT:    s_or_b32 s7, s7, s8
 ; SI-NEXT:    s_and_b32 s8, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_or_b32_e32 v24, v0, v59
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v47
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    v_or_b32_e32 v25, v0, v53
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v46
 ; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_and_b32 s10, s28, 0xffff
@@ -38471,7 +38489,9 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v62, v58
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_mov_b32_e32 v58, v51
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_mov_b32_e32 v51, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v41
@@ -44804,6 +44824,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:104
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:100
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:92
@@ -44823,7 +44844,6 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:104
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:28
@@ -44838,7 +44858,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v55
 ; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:    ; kill: killed $vgpr55
@@ -45051,7 +45071,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v30
 ; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v46
 ; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
@@ -45059,7 +45079,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
 ; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v39
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
@@ -45217,6 +45237,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v18
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_add_i32_e32 v46, vcc, 3, v46
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -46260,8 +46281,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(6)
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 93c11f13ce3ce..462239804f415 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -5003,10 +5003,11 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
 ; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v5
@@ -5024,74 +5025,73 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v29
 ; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:100
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:96
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v20
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
@@ -5099,6 +5099,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v28
 ; SI-NEXT:    s_waitcnt vmcnt(2)
@@ -6350,18 +6352,18 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v30, v28
@@ -6401,13 +6403,12 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
 ; SI-NEXT:    s_cbranch_scc0 .LBB15_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -6701,6 +6702,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v61, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v59
 ; SI-NEXT:    v_mov_b32_e32 v59, v51
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_mov_b32_e32 v51, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v47
@@ -10355,17 +10357,17 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:68
@@ -17541,10 +17543,11 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
 ; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v5
@@ -17562,74 +17565,73 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v29
 ; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:100
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:96
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v20
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
@@ -17637,6 +17639,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v28
 ; SI-NEXT:    s_waitcnt vmcnt(2)
@@ -18888,18 +18892,18 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v30, v28
@@ -18939,13 +18943,12 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
 ; SI-NEXT:    s_cbranch_scc0 .LBB31_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -19239,6 +19242,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v61, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v59
 ; SI-NEXT:    v_mov_b32_e32 v59, v51
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_mov_b32_e32 v51, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v47
@@ -23054,17 +23058,17 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:68
@@ -29211,10 +29215,11 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
 ; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v5
@@ -29232,74 +29237,73 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v29
 ; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:100
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:96
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v20
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
@@ -29307,6 +29311,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v28
 ; SI-NEXT:    s_waitcnt vmcnt(2)
@@ -30558,18 +30564,18 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v30, v28
@@ -30609,13 +30615,12 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
 ; SI-NEXT:    s_cbranch_scc0 .LBB43_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -30909,6 +30914,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v61, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v59
 ; SI-NEXT:    v_mov_b32_e32 v59, v51
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_mov_b32_e32 v51, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v47
@@ -34580,17 +34586,17 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:68
@@ -39859,10 +39865,11 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v55, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
 ; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v5
@@ -39880,74 +39887,73 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v29
 ; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:100
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:100
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:96
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:84
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v20
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v20
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
@@ -39955,6 +39961,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:52
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v28
 ; SI-NEXT:    s_waitcnt vmcnt(2)
@@ -41206,18 +41214,18 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v30, v28
@@ -41257,13 +41265,12 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v4
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v12
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -41557,6 +41564,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v61, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v59
 ; SI-NEXT:    v_mov_b32_e32 v59, v51
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_mov_b32_e32 v51, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v47
@@ -45272,17 +45280,17 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:68
@@ -49981,6 +49989,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:60
 ; SI-NEXT:    s_waitcnt expcnt(4)
@@ -49997,7 +50006,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 30ad46d959b7e..34ebe60dd1e03 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -815,13 +815,13 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
 ; GCN-NEXT:    buffer_store_dword v5, v11, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v4, v12, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT:    buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48
 ; GCN-NEXT:    v_add_i32_e32 v21, vcc, 52, v0
-; GCN-NEXT:    buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    buffer_store_dword v10, v19, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
 ; GCN-NEXT:    buffer_store_dword v9, v20, s[0:3], 0 offen
@@ -836,6 +836,7 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, 24, v0
 ; GCN-NEXT:    v_add_i32_e32 v19, vcc, 20, v0
+; GCN-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
 ; GCN-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
@@ -846,11 +847,12 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
 ; GCN-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
-; GCN-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-NEXT:    buffer_store_dword v18, v9, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v17, v10, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v16, v19, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-NEXT:    buffer_store_dword v14, v5, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v12, v3, s[0:3], 0 offen
@@ -9024,6 +9026,21 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62
+; GCN-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60
+; GCN-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58
+; GCN-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56
+; GCN-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54
+; GCN-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52
+; GCN-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50
+; GCN-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
+; GCN-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
+; GCN-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
+; GCN-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
+; GCN-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
+; GCN-NEXT:    buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
+; GCN-NEXT:    buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
 ; GCN-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2
 ; GCN-NEXT:    buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4
@@ -9040,23 +9057,8 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GCN-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
 ; GCN-NEXT:    buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
 ; GCN-NEXT:    buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
-; GCN-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50
-; GCN-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52
-; GCN-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54
-; GCN-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56
-; GCN-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58
-; GCN-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60
-; GCN-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62
 ; GCN-NEXT:    buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT:    buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
-; GCN-NEXT:    buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
-; GCN-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
-; GCN-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
-; GCN-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
-; GCN-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
-; GCN-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
-; GCN-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NEXT:    s_waitcnt vmcnt(14)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v30
 ; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xfc, v0
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
@@ -9122,7 +9124,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0xac, v0
 ; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xa8, v0
-; GCN-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
 ; GCN-NEXT:    buffer_store_dword v2, v24, s[0:3], 0 offen
@@ -9178,7 +9180,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x58, v0
 ; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x54, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
 ; GCN-NEXT:    buffer_store_dword v2, v32, s[0:3], 0 offen
@@ -9294,15 +9296,16 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52
 ; GFX7-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50
 ; GFX7-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34
-; GFX7-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36
-; GFX7-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38
-; GFX7-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40
-; GFX7-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42
-; GFX7-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44
 ; GFX7-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46
+; GFX7-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44
+; GFX7-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42
+; GFX7-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40
+; GFX7-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38
+; GFX7-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36
+; GFX7-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34
+; GFX7-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32
 ; GFX7-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30
 ; GFX7-NEXT:    buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2
 ; GFX7-NEXT:    buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4
 ; GFX7-NEXT:    buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6
@@ -9317,7 +9320,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24
 ; GFX7-NEXT:    buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26
 ; GFX7-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28
-; GFX7-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30
 ; GFX7-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v17
 ; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
@@ -9419,7 +9421,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x80, v0
 ; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v16
-; GFX7-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v34
 ; GFX7-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
 ; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0x7c, v0
@@ -9427,6 +9428,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 0x78, v0
 ; GFX7-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
 ; GFX7-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
@@ -9597,14 +9599,14 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    v_add_u32_e32 v42, vcc, 50, v1
 ; GFX8-NEXT:    v_addc_u32_e32 v43, vcc, 0, v2, vcc
 ; GFX8-NEXT:    flat_load_ushort v42, v[42:43]
-; GFX8-NEXT:    flat_load_ushort v34, v[33:34]
-; GFX8-NEXT:    flat_load_ushort v36, v[35:36]
-; GFX8-NEXT:    flat_load_ushort v38, v[37:38]
 ; GFX8-NEXT:    flat_load_ushort v39, v[48:49]
 ; GFX8-NEXT:    flat_load_ushort v48, v[50:51]
 ; GFX8-NEXT:    flat_load_ushort v51, v[52:53]
-; GFX8-NEXT:    flat_load_ushort v52, v[54:55]
 ; GFX8-NEXT:    flat_load_ushort v53, v[40:41]
+; GFX8-NEXT:    flat_load_ushort v52, v[54:55]
+; GFX8-NEXT:    flat_load_ushort v38, v[37:38]
+; GFX8-NEXT:    flat_load_ushort v34, v[33:34]
+; GFX8-NEXT:    flat_load_ushort v36, v[35:36]
 ; GFX8-NEXT:    v_add_u32_e32 v49, vcc, 32, v1
 ; GFX8-NEXT:    v_addc_u32_e32 v50, vcc, 0, v2, vcc
 ; GFX8-NEXT:    flat_load_ushort v37, v[3:4]
@@ -9620,6 +9622,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(14)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v44
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v3
+; GFX8-NEXT:    flat_load_ushort v13, v[49:50]
 ; GFX8-NEXT:    flat_load_ushort v3, v[17:18]
 ; GFX8-NEXT:    flat_load_ushort v5, v[21:22]
 ; GFX8-NEXT:    flat_load_ushort v7, v[23:24]
@@ -9627,7 +9630,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    flat_load_ushort v10, v[27:28]
 ; GFX8-NEXT:    flat_load_ushort v11, v[29:30]
 ; GFX8-NEXT:    flat_load_ushort v12, v[31:32]
-; GFX8-NEXT:    flat_load_ushort v13, v[49:50]
 ; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0x84, v0
 ; GFX8-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
@@ -9848,22 +9850,22 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX900-NEXT:    global_load_ushort v22, v[1:2], off offset:38
 ; GFX900-NEXT:    global_load_ushort v23, v[1:2], off offset:36
 ; GFX900-NEXT:    global_load_ushort v24, v[1:2], off offset:34
-; GFX900-NEXT:    global_load_ushort v25, v[1:2], off offset:32
 ; GFX900-NEXT:    global_load_ushort v26, v[1:2], off
+; GFX900-NEXT:    global_load_ushort v25, v[1:2], off offset:32
 ; GFX900-NEXT:    global_load_ushort v27, v[1:2], off offset:2
-; GFX900-NEXT:    global_load_ushort v3, v[1:2], off offset:16
-; GFX900-NEXT:    global_load_ushort v4, v[1:2], off offset:18
-; GFX900-NEXT:    global_load_ushort v5, v[1:2], off offset:20
-; GFX900-NEXT:    global_load_ushort v6, v[1:2], off offset:22
-; GFX900-NEXT:    global_load_ushort v8, v[1:2], off offset:24
 ; GFX900-NEXT:    global_load_ushort v28, v[1:2], off offset:30
-; GFX900-NEXT:    global_load_ushort v29, v[1:2], off offset:26
-; GFX900-NEXT:    global_load_ushort v30, v[1:2], off offset:28
 ; GFX900-NEXT:    global_load_ushort v31, v[1:2], off offset:4
+; GFX900-NEXT:    global_load_ushort v30, v[1:2], off offset:28
 ; GFX900-NEXT:    global_load_ushort v32, v[1:2], off offset:6
+; GFX900-NEXT:    global_load_ushort v29, v[1:2], off offset:26
 ; GFX900-NEXT:    global_load_ushort v33, v[1:2], off offset:8
+; GFX900-NEXT:    global_load_ushort v8, v[1:2], off offset:24
 ; GFX900-NEXT:    global_load_ushort v34, v[1:2], off offset:10
+; GFX900-NEXT:    global_load_ushort v6, v[1:2], off offset:22
 ; GFX900-NEXT:    global_load_ushort v7, v[1:2], off offset:12
+; GFX900-NEXT:    global_load_ushort v5, v[1:2], off offset:20
+; GFX900-NEXT:    global_load_ushort v3, v[1:2], off offset:16
+; GFX900-NEXT:    global_load_ushort v4, v[1:2], off offset:18
 ; GFX900-NEXT:    s_nop 0
 ; GFX900-NEXT:    global_load_ushort v1, v[1:2], off offset:14
 ; GFX900-NEXT:    s_waitcnt vmcnt(31)
@@ -9936,21 +9938,22 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX900-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:148
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[9:10], v2
 ; GFX900-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:144
-; GFX900-NEXT:    s_waitcnt vmcnt(44)
+; GFX900-NEXT:    s_waitcnt vmcnt(43)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v11, 16, v25
 ; GFX900-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:140
 ; GFX900-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:136
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[9:10], v11
-; GFX900-NEXT:    s_waitcnt vmcnt(38)
+; GFX900-NEXT:    s_waitcnt vmcnt(43)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v13, 16, v28
 ; GFX900-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:132
 ; GFX900-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:128
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[9:10], v13
-; GFX900-NEXT:    s_waitcnt vmcnt(38)
+; GFX900-NEXT:    s_waitcnt vmcnt(43)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v15, 16, v30
 ; GFX900-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:124
 ; GFX900-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:120
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[9:10], v15
+; GFX900-NEXT:    s_waitcnt vmcnt(43)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v17, 16, v29
 ; GFX900-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:116
 ; GFX900-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:112
@@ -9959,29 +9962,28 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[11:12], v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[13:14], v2
-; GFX900-NEXT:    s_waitcnt vmcnt(41)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
+; GFX900-NEXT:    s_waitcnt vmcnt(43)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[15:16], v2
-; GFX900-NEXT:    s_waitcnt vmcnt(40)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
 ; GFX900-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:108
 ; GFX900-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:104
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[17:18], v2
-; GFX900-NEXT:    s_waitcnt vmcnt(41)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[19:20], v2
-; GFX900-NEXT:    s_waitcnt vmcnt(40)
+; GFX900-NEXT:    s_waitcnt vmcnt(44)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
+; GFX900-NEXT:    s_waitcnt vmcnt(43)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[21:22], v2
+; GFX900-NEXT:    s_waitcnt vmcnt(41)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX900-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
 ; GFX900-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
 ; GFX900-NEXT:    v_cvt_f64_f32_e32 v[5:6], v2
-; GFX900-NEXT:    s_waitcnt vmcnt(41)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
 ; GFX900-NEXT:    s_waitcnt vmcnt(40)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
@@ -10032,37 +10034,37 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
 ; GFX950-NEXT:    global_load_ushort v1, v[2:3], off offset:2
-; GFX950-NEXT:    global_load_ushort v4, v[2:3], off offset:12
-; GFX950-NEXT:    global_load_ushort v5, v[2:3], off offset:8
-; GFX950-NEXT:    global_load_ushort v6, v[2:3], off offset:4
 ; GFX950-NEXT:    global_load_ushort v7, v[2:3], off
 ; GFX950-NEXT:    global_load_ushort v8, v[2:3], off offset:6
+; GFX950-NEXT:    global_load_ushort v6, v[2:3], off offset:4
 ; GFX950-NEXT:    global_load_ushort v9, v[2:3], off offset:10
+; GFX950-NEXT:    global_load_ushort v5, v[2:3], off offset:8
 ; GFX950-NEXT:    global_load_ushort v10, v[2:3], off offset:14
+; GFX950-NEXT:    global_load_ushort v4, v[2:3], off offset:12
 ; GFX950-NEXT:    global_load_ushort v11, v[2:3], off offset:18
-; GFX950-NEXT:    global_load_ushort v12, v[2:3], off offset:28
-; GFX950-NEXT:    global_load_ushort v13, v[2:3], off offset:24
-; GFX950-NEXT:    global_load_ushort v14, v[2:3], off offset:20
 ; GFX950-NEXT:    global_load_ushort v15, v[2:3], off offset:16
 ; GFX950-NEXT:    global_load_ushort v16, v[2:3], off offset:22
+; GFX950-NEXT:    global_load_ushort v14, v[2:3], off offset:20
 ; GFX950-NEXT:    global_load_ushort v17, v[2:3], off offset:26
+; GFX950-NEXT:    global_load_ushort v13, v[2:3], off offset:24
 ; GFX950-NEXT:    global_load_ushort v18, v[2:3], off offset:30
+; GFX950-NEXT:    global_load_ushort v12, v[2:3], off offset:28
 ; GFX950-NEXT:    global_load_ushort v19, v[2:3], off offset:34
-; GFX950-NEXT:    global_load_ushort v20, v[2:3], off offset:44
-; GFX950-NEXT:    global_load_ushort v21, v[2:3], off offset:40
-; GFX950-NEXT:    global_load_ushort v22, v[2:3], off offset:36
 ; GFX950-NEXT:    global_load_ushort v23, v[2:3], off offset:32
 ; GFX950-NEXT:    global_load_ushort v24, v[2:3], off offset:38
+; GFX950-NEXT:    global_load_ushort v22, v[2:3], off offset:36
 ; GFX950-NEXT:    global_load_ushort v25, v[2:3], off offset:42
+; GFX950-NEXT:    global_load_ushort v21, v[2:3], off offset:40
 ; GFX950-NEXT:    global_load_ushort v26, v[2:3], off offset:46
+; GFX950-NEXT:    global_load_ushort v20, v[2:3], off offset:44
 ; GFX950-NEXT:    global_load_ushort v42, v[2:3], off offset:50
 ; GFX950-NEXT:    global_load_ushort v43, v[2:3], off offset:62
 ; GFX950-NEXT:    global_load_ushort v46, v[2:3], off offset:60
-; GFX950-NEXT:    global_load_ushort v47, v[2:3], off offset:56
-; GFX950-NEXT:    global_load_ushort v60, v[2:3], off offset:52
 ; GFX950-NEXT:    global_load_ushort v56, v[2:3], off offset:48
 ; GFX950-NEXT:    global_load_ushort v57, v[2:3], off offset:54
 ; GFX950-NEXT:    global_load_ushort v58, v[2:3], off offset:58
+; GFX950-NEXT:    global_load_ushort v47, v[2:3], off offset:56
+; GFX950-NEXT:    global_load_ushort v60, v[2:3], off offset:52
 ; GFX950-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
@@ -10071,51 +10073,53 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(31)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX950-NEXT:    s_waitcnt vmcnt(30)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
 ; GFX950-NEXT:    s_waitcnt vmcnt(29)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v5
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT:    s_waitcnt vmcnt(28)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX950-NEXT:    s_waitcnt vmcnt(27)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
 ; GFX950-NEXT:    s_waitcnt vmcnt(26)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v5
 ; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
-; GFX950-NEXT:    s_waitcnt vmcnt(24)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX950-NEXT:    s_waitcnt vmcnt(24)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
 ; GFX950-NEXT:    s_waitcnt vmcnt(23)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v31, 16, v11
 ; GFX950-NEXT:    s_waitcnt vmcnt(22)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX950-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
 ; GFX950-NEXT:    s_waitcnt vmcnt(21)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX950-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
 ; GFX950-NEXT:    s_waitcnt vmcnt(20)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
-; GFX950-NEXT:    s_waitcnt vmcnt(19)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
 ; GFX950-NEXT:    s_waitcnt vmcnt(18)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[12:13], v27
-; GFX950-NEXT:    s_waitcnt vmcnt(16)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX950-NEXT:    s_waitcnt vmcnt(17)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v18
+; GFX950-NEXT:    s_waitcnt vmcnt(16)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
 ; GFX950-NEXT:    s_waitcnt vmcnt(15)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v19
-; GFX950-NEXT:    s_waitcnt vmcnt(14)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v44, 16, v20
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[12:13], v27
 ; GFX950-NEXT:    s_waitcnt vmcnt(13)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v21
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v24
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[14:15], v30
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[20:21], v31
+; GFX950-NEXT:    s_waitcnt vmcnt(11)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v25
 ; GFX950-NEXT:    s_waitcnt vmcnt(10)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v24
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v21
 ; GFX950-NEXT:    s_waitcnt vmcnt(9)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v25
-; GFX950-NEXT:    s_waitcnt vmcnt(8)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
+; GFX950-NEXT:    s_waitcnt vmcnt(8)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v44, 16, v20
 ; GFX950-NEXT:    s_waitcnt vmcnt(7)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v42
 ; GFX950-NEXT:    s_waitcnt vmcnt(6)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v43
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[20:21], v31
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[18:19], v32
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[24:25], v33
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[26:27], v36
@@ -10127,10 +10131,11 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(5)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v46
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[42:43], v42
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    s_waitcnt vmcnt(2)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v46, 16, v58
 ; GFX950-NEXT:    scratch_store_dwordx4 v0, v[42:45], off offset:240
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[58:59], v46
+; GFX950-NEXT:    s_waitcnt vmcnt(2)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v46, 16, v47
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[44:45], v1
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v56
@@ -10141,11 +10146,11 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v23
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
 ; GFX950-NEXT:    scratch_store_dwordx4 v0, v[56:59], off offset:224
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[10:11], v28
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[16:17], v29
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[58:59], v1
+; GFX950-NEXT:    s_waitcnt vmcnt(2)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v60
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[16:17], v29
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[22:23], v34
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[28:29], v35
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[34:35], v48
@@ -10386,105 +10391,105 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1f
 ; GFX11-NEXT:    global_load_u16 v3, v[1:2], off offset:2
-; GFX11-NEXT:    global_load_u16 v4, v[1:2], off offset:12
-; GFX11-NEXT:    global_load_u16 v5, v[1:2], off offset:8
-; GFX11-NEXT:    global_load_u16 v6, v[1:2], off offset:4
 ; GFX11-NEXT:    global_load_u16 v7, v[1:2], off
 ; GFX11-NEXT:    global_load_u16 v8, v[1:2], off offset:6
+; GFX11-NEXT:    global_load_u16 v6, v[1:2], off offset:4
 ; GFX11-NEXT:    global_load_u16 v9, v[1:2], off offset:10
+; GFX11-NEXT:    global_load_u16 v5, v[1:2], off offset:8
 ; GFX11-NEXT:    global_load_u16 v10, v[1:2], off offset:14
+; GFX11-NEXT:    global_load_u16 v4, v[1:2], off offset:12
 ; GFX11-NEXT:    global_load_u16 v11, v[1:2], off offset:18
-; GFX11-NEXT:    global_load_u16 v12, v[1:2], off offset:28
-; GFX11-NEXT:    global_load_u16 v13, v[1:2], off offset:24
-; GFX11-NEXT:    global_load_u16 v14, v[1:2], off offset:20
 ; GFX11-NEXT:    global_load_u16 v15, v[1:2], off offset:16
 ; GFX11-NEXT:    global_load_u16 v16, v[1:2], off offset:22
+; GFX11-NEXT:    global_load_u16 v14, v[1:2], off offset:20
 ; GFX11-NEXT:    global_load_u16 v17, v[1:2], off offset:26
+; GFX11-NEXT:    global_load_u16 v13, v[1:2], off offset:24
 ; GFX11-NEXT:    global_load_u16 v18, v[1:2], off offset:30
+; GFX11-NEXT:    global_load_u16 v12, v[1:2], off offset:28
 ; GFX11-NEXT:    global_load_u16 v19, v[1:2], off offset:34
-; GFX11-NEXT:    global_load_u16 v20, v[1:2], off offset:44
-; GFX11-NEXT:    global_load_u16 v21, v[1:2], off offset:40
-; GFX11-NEXT:    global_load_u16 v22, v[1:2], off offset:36
 ; GFX11-NEXT:    global_load_u16 v23, v[1:2], off offset:32
 ; GFX11-NEXT:    global_load_u16 v24, v[1:2], off offset:38
+; GFX11-NEXT:    global_load_u16 v22, v[1:2], off offset:36
 ; GFX11-NEXT:    global_load_u16 v25, v[1:2], off offset:42
+; GFX11-NEXT:    global_load_u16 v21, v[1:2], off offset:40
 ; GFX11-NEXT:    global_load_u16 v26, v[1:2], off offset:46
+; GFX11-NEXT:    global_load_u16 v20, v[1:2], off offset:44
 ; GFX11-NEXT:    global_load_u16 v27, v[1:2], off offset:50
-; GFX11-NEXT:    global_load_u16 v28, v[1:2], off offset:60
-; GFX11-NEXT:    global_load_u16 v29, v[1:2], off offset:56
-; GFX11-NEXT:    global_load_u16 v30, v[1:2], off offset:52
 ; GFX11-NEXT:    global_load_u16 v31, v[1:2], off offset:48
 ; GFX11-NEXT:    global_load_u16 v32, v[1:2], off offset:54
+; GFX11-NEXT:    global_load_u16 v30, v[1:2], off offset:52
 ; GFX11-NEXT:    global_load_u16 v33, v[1:2], off offset:58
+; GFX11-NEXT:    global_load_u16 v29, v[1:2], off offset:56
+; GFX11-NEXT:    global_load_u16 v28, v[1:2], off offset:60
 ; GFX11-NEXT:    global_load_u16 v1, v[1:2], off offset:62
 ; GFX11-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
 ; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
 ; GFX11-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX11-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v11
 ; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v100, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v15
 ; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
 ; GFX11-NEXT:    s_waitcnt vmcnt(20)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v100, 16, v12
 ; GFX11-NEXT:    s_waitcnt vmcnt(15)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
 ; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v101, 16, v23
 ; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v24
 ; GFX11-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v101, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
 ; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v20
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v27
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v31
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v32
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[96:97], v65
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[84:85], v29
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[82:83], v64
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[86:87], v33
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[84:85], v29
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[96:97], v65
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[98:99], v1
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[80:81], v30
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[70:71], v52
@@ -10539,46 +10544,48 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-NEXT:    s_clause 0x1f
 ; GFX1250-NEXT:    global_load_u16 v1, v[2:3], off offset:2
-; GFX1250-NEXT:    global_load_u16 v4, v[2:3], off offset:12
-; GFX1250-NEXT:    global_load_u16 v5, v[2:3], off offset:8
-; GFX1250-NEXT:    global_load_u16 v6, v[2:3], off offset:4
 ; GFX1250-NEXT:    global_load_u16 v7, v[2:3], off
 ; GFX1250-NEXT:    global_load_u16 v8, v[2:3], off offset:6
+; GFX1250-NEXT:    global_load_u16 v6, v[2:3], off offset:4
 ; GFX1250-NEXT:    global_load_u16 v9, v[2:3], off offset:10
+; GFX1250-NEXT:    global_load_u16 v5, v[2:3], off offset:8
 ; GFX1250-NEXT:    global_load_u16 v10, v[2:3], off offset:14
-; GFX1250-NEXT:    global_load_u16 v11, v[2:3], off offset:18
+; GFX1250-NEXT:    global_load_u16 v4, v[2:3], off offset:12
 ; GFX1250-NEXT:    global_load_u16 v12, v[2:3], off offset:62
 ; GFX1250-NEXT:    global_load_u16 v13, v[2:3], off offset:60
+; GFX1250-NEXT:    global_load_u16 v11, v[2:3], off offset:18
 ; GFX1250-NEXT:    global_load_u16 v14, v[2:3], off offset:58
 ; GFX1250-NEXT:    global_load_u16 v15, v[2:3], off offset:56
-; GFX1250-NEXT:    global_load_u16 v16, v[2:3], off offset:28
-; GFX1250-NEXT:    global_load_u16 v17, v[2:3], off offset:24
-; GFX1250-NEXT:    global_load_u16 v18, v[2:3], off offset:20
 ; GFX1250-NEXT:    global_load_u16 v19, v[2:3], off offset:16
 ; GFX1250-NEXT:    global_load_u16 v20, v[2:3], off offset:22
+; GFX1250-NEXT:    global_load_u16 v18, v[2:3], off offset:20
 ; GFX1250-NEXT:    global_load_u16 v21, v[2:3], off offset:26
+; GFX1250-NEXT:    global_load_u16 v17, v[2:3], off offset:24
 ; GFX1250-NEXT:    global_load_u16 v22, v[2:3], off offset:30
+; GFX1250-NEXT:    global_load_u16 v16, v[2:3], off offset:28
 ; GFX1250-NEXT:    global_load_u16 v23, v[2:3], off offset:34
-; GFX1250-NEXT:    global_load_u16 v24, v[2:3], off offset:44
-; GFX1250-NEXT:    global_load_u16 v25, v[2:3], off offset:40
-; GFX1250-NEXT:    global_load_u16 v26, v[2:3], off offset:36
 ; GFX1250-NEXT:    global_load_u16 v27, v[2:3], off offset:32
 ; GFX1250-NEXT:    global_load_u16 v28, v[2:3], off offset:38
+; GFX1250-NEXT:    global_load_u16 v26, v[2:3], off offset:36
 ; GFX1250-NEXT:    global_load_u16 v29, v[2:3], off offset:42
+; GFX1250-NEXT:    global_load_u16 v25, v[2:3], off offset:40
 ; GFX1250-NEXT:    global_load_u16 v30, v[2:3], off offset:46
+; GFX1250-NEXT:    global_load_u16 v24, v[2:3], off offset:44
 ; GFX1250-NEXT:    global_load_u16 v31, v[2:3], off offset:50
-; GFX1250-NEXT:    global_load_u16 v32, v[2:3], off offset:52
 ; GFX1250-NEXT:    global_load_u16 v33, v[2:3], off offset:48
 ; GFX1250-NEXT:    global_load_u16 v34, v[2:3], off offset:54
+; GFX1250-NEXT:    global_load_u16 v32, v[2:3], off offset:52
 ; GFX1250-NEXT:    s_wait_loadcnt 0x1e
-; GFX1250-NEXT:    v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v37, 16, v4
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v84, 16, v7
 ; GFX1250-NEXT:    s_wait_loadcnt 0x1c
-; GFX1250-NEXT:    v_dual_lshlrev_b32 v81, 16, v5 :: v_dual_lshlrev_b32 v85, 16, v6
-; GFX1250-NEXT:    s_wait_loadcnt 0x1a
-; GFX1250-NEXT:    v_dual_lshlrev_b32 v84, 16, v7 :: v_dual_lshlrev_b32 v35, 16, v8
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v35, 16, v8 :: v_dual_lshlrev_b32 v85, 16, v6
+; GFX1250-NEXT:    s_wait_loadcnt 0x1b
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v80, 16, v9
+; GFX1250-NEXT:    s_wait_loadcnt 0x19
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v81, 16, v5 :: v_dual_lshlrev_b32 v36, 16, v10
 ; GFX1250-NEXT:    s_wait_loadcnt 0x18
-; GFX1250-NEXT:    v_dual_lshlrev_b32 v80, 16, v9 :: v_dual_lshlrev_b32 v36, 16, v10
-; GFX1250-NEXT:    s_wait_loadcnt 0x15
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v37, 16, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x16
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    v_dual_lshlrev_b32 v2, 16, v12 :: v_dual_lshlrev_b32 v3, 16, v13
 ; GFX1250-NEXT:    s_wait_loadcnt 0x14
@@ -10588,27 +10595,29 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v2
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
 ; GFX1250-NEXT:    s_wait_loadcnt 0x11
-; GFX1250-NEXT:    v_dual_lshlrev_b32 v68, 16, v17 :: v_dual_lshlrev_b32 v39, 16, v16
-; GFX1250-NEXT:    s_wait_loadcnt 0xe
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX1250-NEXT:    s_wait_loadcnt 0xc
+; GFX1250-NEXT:    s_wait_loadcnt 0xd
 ; GFX1250-NEXT:    v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v38, 16, v22
+; GFX1250-NEXT:    s_wait_loadcnt 0xc
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v68, 16, v17 :: v_dual_lshlrev_b32 v39, 16, v16
 ; GFX1250-NEXT:    s_wait_loadcnt 0x9
-; GFX1250-NEXT:    v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v25, 16, v25
-; GFX1250-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v49, 16, v28
+; GFX1250-NEXT:    s_wait_loadcnt 0x7
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
 ; GFX1250-NEXT:    s_wait_loadcnt 0x5
-; GFX1250-NEXT:    v_dual_lshlrev_b32 v49, 16, v28 :: v_dual_lshlrev_b32 v64, 16, v29
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v50, 16, v30
 ; GFX1250-NEXT:    s_wait_loadcnt 0x3
-; GFX1250-NEXT:    v_dual_lshlrev_b32 v50, 16, v30 :: v_dual_lshlrev_b32 v51, 16, v31
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v24, 16, v24 :: v_dual_lshlrev_b32 v51, 16, v31
+; GFX1250-NEXT:    s_wait_loadcnt 0x1
 ; GFX1250-NEXT:    v_dual_lshlrev_b32 v33, 16, v33 :: v_dual_lshlrev_b32 v52, 16, v34
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_dual_lshlrev_b32 v32, 16, v32 :: v_dual_lshlrev_b32 v69, 16, v27
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v70, 16, v26
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[14:15], v35
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[54:55], v52
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[52:53], v32
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[30:31], v38
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[28:29], v39
@@ -48168,43 +48177,43 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_and_b32_e32 v36, 1, v13
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:180
 ; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:56
 ; GCN-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:184
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:180
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:188
 ; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
 ; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:192
 ; GCN-NEXT:    v_and_b32_e32 v53, 1, v26
-; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:84
-; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:88
-; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:92
-; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96
-; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:100
-; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:104
-; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:108
 ; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:112
+; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:108
+; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:104
+; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:100
+; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96
+; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:92
+; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:88
+; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:84
 ; GCN-NEXT:    v_and_b32_e32 v27, 1, v27
 ; GCN-NEXT:    v_and_b32_e32 v28, 1, v28
 ; GCN-NEXT:    v_and_b32_e32 v29, 1, v29
 ; GCN-NEXT:    v_and_b32_e32 v30, 1, v30
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:116
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:120
 ; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32
 ; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:252
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:120
 ; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:248
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:116
 ; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:244
 ; GCN-NEXT:    s_waitcnt expcnt(6)
 ; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:240
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt vmcnt(14)
 ; GCN-NEXT:    v_mul_f32_e32 v40, 1.0, v37
 ; GCN-NEXT:    v_mul_f32_e32 v38, 1.0, v38
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v36
-; GCN-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NEXT:    v_mul_f32_e32 v36, 1.0, v43
-; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-NEXT:    v_mul_f32_e32 v37, 1.0, v44
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v30
 ; GCN-NEXT:    v_cndmask_b32_e64 v30, v37, v36, s[4:5]
@@ -48222,14 +48231,16 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:212
 ; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:128
+; GCN-NEXT:    s_waitcnt vmcnt(13)
 ; GCN-NEXT:    v_mul_f32_e32 v42, 1.0, v42
-; GCN-NEXT:    s_waitcnt vmcnt(10)
+; GCN-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-NEXT:    v_mul_f32_e32 v43, 1.0, v45
+; GCN-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-NEXT:    v_mul_f32_e32 v41, 1.0, v41
-; GCN-NEXT:    s_waitcnt vmcnt(9)
+; GCN-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-NEXT:    v_mul_f32_e32 v44, 1.0, v46
 ; GCN-NEXT:    v_mul_f32_e32 v55, 1.0, v55
-; GCN-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-NEXT:    v_mul_f32_e32 v45, 1.0, v47
 ; GCN-NEXT:    v_mul_f32_e32 v54, 1.0, v54
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
@@ -48279,14 +48290,14 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GCN-NEXT:    v_cndmask_b32_e64 v23, v56, v50, s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v22
 ; GCN-NEXT:    v_cndmask_b32_e64 v22, v57, v49, s[4:5]
-; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:68
-; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:196
-; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:72
-; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:200
 ; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:76
 ; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:204
 ; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:80
 ; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:208
+; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:68
+; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:196
+; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:72
+; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:200
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
 ; GCN-NEXT:    v_and_b32_e32 v21, 1, v21
@@ -48295,13 +48306,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GCN-NEXT:    v_mul_f32_e32 v58, 1.0, v60
 ; GCN-NEXT:    v_mul_f32_e32 v39, 1.0, v39
 ; GCN-NEXT:    v_mul_f32_e32 v59, 1.0, v61
-; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NEXT:    v_mul_f32_e32 v46, 1.0, v46
-; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-NEXT:    v_mul_f32_e32 v47, 1.0, v47
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-NEXT:    v_mul_f32_e32 v56, 1.0, v56
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-NEXT:    v_mul_f32_e32 v57, 1.0, v57
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v21
 ; GCN-NEXT:    v_cndmask_b32_e64 v21, v58, v48, s[4:5]
@@ -48331,9 +48342,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GCN-NEXT:    v_mul_f32_e32 v33, 1.0, v33
 ; GCN-NEXT:    v_mul_f32_e32 v34, 1.0, v34
 ; GCN-NEXT:    v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-NEXT:    v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-NEXT:    v_mul_f32_e32 v50, 1.0, v50
+; GCN-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-NEXT:    v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-NEXT:    v_mul_f32_e32 v52, 1.0, v52
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v17
 ; GCN-NEXT:    v_cndmask_b32_e64 v17, v52, v51, s[4:5]
@@ -48883,42 +48898,41 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
-; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
-; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
-; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
-; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX8-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
-; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GFX8-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX8-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
-; GFX8-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX8-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
-; GFX8-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
-; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
-; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
-; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
-; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX8-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
-; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
-; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX8-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:120
-; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:56
-; GFX8-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:124
-; GFX8-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:60
-; GFX8-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:128
 ; GFX8-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:64
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v25
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:128
+; GFX8-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:60
+; GFX8-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:124
+; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:56
+; GFX8-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:120
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX8-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
+; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
+; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
+; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
+; GFX8-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX8-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
+; GFX8-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX8-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
+; GFX8-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
+; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
+; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GFX8-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
+; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
+; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
+; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
+; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v26
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v25
 ; GFX8-NEXT:    v_cndmask_b32_e64 v24, v33, v24, s[38:39]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v25, v25, v26, s[36:37]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v28
@@ -48953,31 +48967,45 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
 ; GFX8-NEXT:    v_cndmask_b32_e64 v17, v33, v17, s[42:43]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[40:41]
+; GFX8-NEXT:    s_waitcnt vmcnt(13)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
+; GFX8-NEXT:    s_waitcnt vmcnt(12)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v15, v33, v15, s[28:29]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[26:27]
+; GFX8-NEXT:    s_waitcnt vmcnt(11)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v11
+; GFX8-NEXT:    s_waitcnt vmcnt(10)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, v33, v13, s[24:25]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[22:23]
+; GFX8-NEXT:    s_waitcnt vmcnt(9)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; GFX8-NEXT:    s_waitcnt vmcnt(8)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v33, v11, s[20:21]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[18:19]
+; GFX8-NEXT:    s_waitcnt vmcnt(7)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, v33, v9, s[16:17]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[14:15]
+; GFX8-NEXT:    s_waitcnt vmcnt(5)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v33, v7, s[12:13]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v33, v5, s[8:9]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v33, v3, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -49103,99 +49131,114 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX900-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX900-NEXT:    v_and_b32_e32 v0, 1, v30
 ; GFX900-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
-; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
-; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
-; GFX900-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX900-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
-; GFX900-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX900-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
-; GFX900-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX900-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
-; GFX900-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GFX900-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GFX900-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX900-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
-; GFX900-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX900-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
-; GFX900-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX900-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
-; GFX900-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
-; GFX900-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
-; GFX900-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
-; GFX900-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
-; GFX900-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX900-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
-; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
-; GFX900-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
-; GFX900-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52
-; GFX900-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
-; GFX900-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
-; GFX900-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:124
-; GFX900-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:60
 ; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:124
+; GFX900-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:60
+; GFX900-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
+; GFX900-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
+; GFX900-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX900-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GFX900-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
+; GFX900-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX900-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
+; GFX900-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GFX900-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
+; GFX900-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GFX900-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
+; GFX900-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
+; GFX900-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
+; GFX900-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX900-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
+; GFX900-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX900-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
+; GFX900-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
+; GFX900-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
+; GFX900-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GFX900-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
+; GFX900-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX900-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
+; GFX900-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX900-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
+; GFX900-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
+; GFX900-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX900-NEXT:    s_waitcnt vmcnt(30)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v30, v31, v32, s[34:35]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
 ; GFX900-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[30:31]
+; GFX900-NEXT:    s_waitcnt vmcnt(28)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v32, v28, v29, s[94:95]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
 ; GFX900-NEXT:    v_cndmask_b32_e64 v28, v28, v29, s[92:93]
+; GFX900-NEXT:    s_waitcnt vmcnt(26)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v29, v26, v27, s[90:91]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
 ; GFX900-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[88:89]
+; GFX900-NEXT:    s_waitcnt vmcnt(24)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v27, v24, v25, s[78:79]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
 ; GFX900-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[76:77]
+; GFX900-NEXT:    s_waitcnt vmcnt(22)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v25, v22, v23, s[74:75]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
 ; GFX900-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[72:73]
+; GFX900-NEXT:    s_waitcnt vmcnt(20)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v23, v20, v21, s[62:63]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; GFX900-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[60:61]
+; GFX900-NEXT:    s_waitcnt vmcnt(18)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v21, v18, v19, s[58:59]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GFX900-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[56:57]
+; GFX900-NEXT:    s_waitcnt vmcnt(16)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v19, v16, v17, s[46:47]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
 ; GFX900-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[44:45]
+; GFX900-NEXT:    s_waitcnt vmcnt(14)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v17, v14, v15, s[42:43]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
 ; GFX900-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[40:41]
+; GFX900-NEXT:    s_waitcnt vmcnt(12)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v15, v12, v13, s[28:29]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX900-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[26:27]
+; GFX900-NEXT:    s_waitcnt vmcnt(10)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v13, v10, v11, s[24:25]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; GFX900-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[22:23]
+; GFX900-NEXT:    s_waitcnt vmcnt(8)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v11, v8, v9, s[20:21]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
 ; GFX900-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[18:19]
+; GFX900-NEXT:    s_waitcnt vmcnt(6)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v9, v6, v7, s[16:17]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX900-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[14:15]
+; GFX900-NEXT:    s_waitcnt vmcnt(4)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v7, v4, v5, s[12:13]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX900-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[8:9]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -49247,18 +49290,18 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:52
 ; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:112
 ; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:48
-; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:88
-; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:24
-; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:92
-; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:28
 ; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:108
 ; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:44
-; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:96
-; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:32
-; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:100
-; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:36
 ; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:104
 ; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:40
+; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:36
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:96
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:32
+; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:28
+; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:88
+; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:24
 ; GFX950-NEXT:    v_and_b32_e32 v29, 1, v29
 ; GFX950-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
 ; GFX950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v29
@@ -49350,7 +49393,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v37, v36, vcc
 ; GFX950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v20
-; GFX950-NEXT:    s_waitcnt vmcnt(16)
+; GFX950-NEXT:    s_waitcnt vmcnt(20)
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v55
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v54
 ; GFX950-NEXT:    v_cndmask_b32_e32 v20, v54, v55, vcc
@@ -49358,7 +49401,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v21, v37, v36, vcc
 ; GFX950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
-; GFX950-NEXT:    s_waitcnt vmcnt(10)
+; GFX950-NEXT:    s_waitcnt vmcnt(18)
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v45
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v44
 ; GFX950-NEXT:    v_cndmask_b32_e32 v18, v44, v45, vcc
@@ -49367,6 +49410,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
 ; GFX950-NEXT:    v_cndmask_b32_e32 v19, v37, v36, vcc
 ; GFX950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX950-NEXT:    s_waitcnt vmcnt(16)
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v43
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v42
 ; GFX950-NEXT:    v_cndmask_b32_e32 v16, v42, v43, vcc
@@ -49375,6 +49419,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
 ; GFX950-NEXT:    v_cndmask_b32_e32 v17, v37, v36, vcc
 ; GFX950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX950-NEXT:    s_waitcnt vmcnt(14)
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v41
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v40
 ; GFX950-NEXT:    v_cndmask_b32_e32 v14, v40, v41, vcc
@@ -49383,6 +49428,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v37, v36, vcc
 ; GFX950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX950-NEXT:    s_waitcnt vmcnt(12)
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v53
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v52
 ; GFX950-NEXT:    v_cndmask_b32_e32 v12, v52, v53, vcc
@@ -49390,6 +49436,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e32 v13, v37, v36, vcc
 ; GFX950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX950-NEXT:    s_waitcnt vmcnt(10)
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v36, 16, v51
 ; GFX950-NEXT:    v_lshrrev_b32_e32 v37, 16, v50
 ; GFX950-NEXT:    v_cndmask_b32_e32 v10, v50, v51, vcc
@@ -49530,99 +49577,114 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v30
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s44, 1, v0
 ; GFX10-NEXT:    s_clause 0x1f
-; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
-; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
-; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
-; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
-; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
-; GFX10-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
-; GFX10-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
-; GFX10-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
-; GFX10-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
-; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
-; GFX10-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
-; GFX10-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
-; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
-; GFX10-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52
-; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
-; GFX10-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
-; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:124
-; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:60
 ; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:128
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(30)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v32, v30, v31, s44
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
 ; GFX10-NEXT:    v_cndmask_b32_e64 v30, v30, v31, s43
+; GFX10-NEXT:    s_waitcnt vmcnt(28)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v31, v28, v29, s42
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
 ; GFX10-NEXT:    v_cndmask_b32_e64 v28, v28, v29, s41
+; GFX10-NEXT:    s_waitcnt vmcnt(26)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v29, v26, v27, s40
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
 ; GFX10-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s29
+; GFX10-NEXT:    s_waitcnt vmcnt(24)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v27, v24, v25, s28
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
 ; GFX10-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s27
+; GFX10-NEXT:    s_waitcnt vmcnt(22)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, v22, v23, s26
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
 ; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s25
+; GFX10-NEXT:    s_waitcnt vmcnt(20)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v23, v20, v21, s24
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s23
+; GFX10-NEXT:    s_waitcnt vmcnt(18)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, v18, v19, s22
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s21
+; GFX10-NEXT:    s_waitcnt vmcnt(16)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, v16, v17, s20
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s19
+; GFX10-NEXT:    s_waitcnt vmcnt(14)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, v14, v15, s18
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s17
+; GFX10-NEXT:    s_waitcnt vmcnt(12)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, v12, v13, s16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s15
+; GFX10-NEXT:    s_waitcnt vmcnt(10)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, v10, v11, s14
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s13
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v8, v9, s12
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s11
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, v6, v7, s10
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s9
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v4, v5, s8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s7
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s6
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -49650,17 +49712,17 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    s_clause 0x1f
 ; GFX11TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
-; GFX11TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:68
-; GFX11TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:76
-; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:124
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:128
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:64
+; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:124
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:60
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:120
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:56
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:116
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:52
+; GFX11TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:68
+; GFX11TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:72
+; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:76
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:112
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:48
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:108
@@ -49746,15 +49808,15 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 1, v13.h
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11TRUE16-NEXT:    v_and_b16 v0.h, 1, v31.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.l, v36.l, v37.l, s26
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.l, v35.l, v38.l, s27
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.h, v35.h, v38.h, s28
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(26)
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.l, v39.l, v48.l, s29
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.h, v39.h, v48.h, s25
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.l, v49.l, v50.l, s24
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.h, v49.h, v50.h, s23
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(19)
@@ -50003,12 +50065,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX1250TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250TRUE16-NEXT:    s_clause 0x20
 ; GFX1250TRUE16-NEXT:    scratch_load_u16 v31, off, s32
-; GFX1250TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:68
-; GFX1250TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:72
-; GFX1250TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:76
-; GFX1250TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:124
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:128
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:64
+; GFX1250TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:124
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:60
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:120
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:56
@@ -50026,6 +50085,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:32
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:92
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:28
+; GFX1250TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:68
+; GFX1250TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:72
+; GFX1250TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:76
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:88
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:24
 ; GFX1250TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:84
@@ -50099,33 +50161,33 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX1250TRUE16-NEXT:    v_cmp_eq_u16_e64 s29, 1, v11.h
 ; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x20
 ; GFX1250TRUE16-NEXT:    v_and_b16 v0.h, 1, v31.l
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x1a
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x1e
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v15.l, v36.l, v37.l, s26
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x19
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x1c
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v14.l, v35.l, v38.l, s27
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v14.h, v35.h, v38.h, s28
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x17
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x1a
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v13.l, v39.l, v48.l, s29
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v13.h, v39.h, v48.h, s25
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x15
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x18
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v12.l, v49.l, v50.l, s24
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v12.h, v49.h, v50.h, s23
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x13
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x16
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v11.l, v51.l, v52.l, s22
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v11.h, v51.h, v52.h, s21
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x11
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x14
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v10.l, v53.l, v54.l, s20
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v10.h, v53.h, v54.h, s19
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0xf
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x12
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v9.l, v55.l, v64.l, s18
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v9.h, v55.h, v64.h, s17
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0xd
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x10
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v8.l, v65.l, v66.l, s16
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v8.h, v65.h, v66.h, s15
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0xb
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0xe
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v7.l, v67.l, v68.l, s14
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v7.h, v67.h, v68.h, s13
-; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x9
+; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0xc
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v6.l, v69.l, v70.l, s12
 ; GFX1250TRUE16-NEXT:    v_cndmask_b16 v6.h, v69.h, v70.h, s11
 ; GFX1250TRUE16-NEXT:    s_wait_loadcnt 0x7
@@ -50172,15 +50234,15 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v53, off, s32 offset:40
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v54, off, s32 offset:100
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v55, off, s32 offset:36
-; GFX1250FAKE16-NEXT:    scratch_load_b32 v64, off, s32 offset:76
-; GFX1250FAKE16-NEXT:    scratch_load_b32 v65, off, s32 offset:12
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v66, off, s32 offset:96
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v67, off, s32 offset:32
-; GFX1250FAKE16-NEXT:    scratch_load_b32 v68, off, s32 offset:80
-; GFX1250FAKE16-NEXT:    scratch_load_b32 v69, off, s32 offset:84
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v70, off, s32 offset:92
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v71, off, s32 offset:28
+; GFX1250FAKE16-NEXT:    scratch_load_b32 v69, off, s32 offset:84
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v80, off, s32 offset:20
+; GFX1250FAKE16-NEXT:    scratch_load_b32 v68, off, s32 offset:80
+; GFX1250FAKE16-NEXT:    scratch_load_b32 v64, off, s32 offset:76
+; GFX1250FAKE16-NEXT:    scratch_load_b32 v65, off, s32 offset:12
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v81, off, s32 offset:88
 ; GFX1250FAKE16-NEXT:    scratch_load_b32 v82, off, s32 offset:24
 ; GFX1250FAKE16-NEXT:    v_and_b32_e32 v30, 1, v30
@@ -50251,11 +50313,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1250FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
 ; GFX1250FAKE16-NEXT:    v_dual_lshrrev_b32 v55, 16, v55 :: v_dual_lshrrev_b32 v54, 16, v54
-; GFX1250FAKE16-NEXT:    s_wait_loadcnt 0xc
+; GFX1250FAKE16-NEXT:    s_wait_loadcnt 0xe
 ; GFX1250FAKE16-NEXT:    v_cndmask_b32_e32 v14, v66, v67, vcc_lo
 ; GFX1250FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX1250FAKE16-NEXT:    v_dual_lshrrev_b32 v67, 16, v67 :: v_dual_lshrrev_b32 v66, 16, v66
-; GFX1250FAKE16-NEXT:    s_wait_loadcnt 0x8
+; GFX1250FAKE16-NEXT:    s_wait_loadcnt 0xc
 ; GFX1250FAKE16-NEXT:    v_cndmask_b32_e32 v12, v70, v71, vcc_lo
 ; GFX1250FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
 ; GFX1250FAKE16-NEXT:    v_dual_lshrrev_b32 v70, 16, v70 :: v_dual_bitop2_b32 v25, 1, v25 bitop3:0x40
@@ -54700,12 +54762,12 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
 ; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:36
 ; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:32
 ; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:28
+; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:24
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:16
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32 offset:4
 ; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:8
 ; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:12
-; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:16
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:20
-; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:24
 ; GFX950-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
@@ -54800,7 +54862,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX950-NEXT:    v_fmac_f32_e32 v23, v6, v22
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    s_waitcnt vmcnt(5)
 ; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v55
 ; GFX950-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
 ; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v5
@@ -54809,6 +54871,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX950-NEXT:    v_fmac_f32_e32 v22, v5, v21
+; GFX950-NEXT:    s_waitcnt vmcnt(4)
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v37
 ; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
 ; GFX950-NEXT:    v_and_b32_e32 v39, 0xffff0000, v4
@@ -54817,6 +54880,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX950-NEXT:    v_fmac_f32_e32 v21, v4, v20
+; GFX950-NEXT:    s_waitcnt vmcnt(3)
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v34
 ; GFX950-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
 ; GFX950-NEXT:    v_and_b32_e32 v37, 0xffff0000, v3
@@ -54825,6 +54889,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX950-NEXT:    v_fmac_f32_e32 v20, v3, v19
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
 ; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
 ; GFX950-NEXT:    v_and_b32_e32 v34, 0xffff0000, v2
@@ -55903,7 +55968,6 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_clause 0x10
-; GFX1250-NEXT:    scratch_load_b32 v31, off, s32 offset:64
 ; GFX1250-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX1250-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX1250-NEXT:    scratch_load_b32 v34, off, s32 offset:12
@@ -55919,36 +55983,37 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
 ; GFX1250-NEXT:    scratch_load_b32 v52, off, s32 offset:52
 ; GFX1250-NEXT:    scratch_load_b32 v53, off, s32 offset:56
 ; GFX1250-NEXT:    scratch_load_b32 v54, off, s32 offset:60
+; GFX1250-NEXT:    scratch_load_b32 v31, off, s32 offset:64
 ; GFX1250-NEXT:    scratch_load_b32 v55, off, s32
-; GFX1250-NEXT:    s_wait_loadcnt 0xf
+; GFX1250-NEXT:    s_wait_loadcnt 0x10
 ; GFX1250-NEXT:    v_pk_fma_bf16 v0, v0, v16, v32
-; GFX1250-NEXT:    s_wait_loadcnt 0xe
+; GFX1250-NEXT:    s_wait_loadcnt 0xf
 ; GFX1250-NEXT:    v_pk_fma_bf16 v1, v1, v17, v33
-; GFX1250-NEXT:    s_wait_loadcnt 0xd
+; GFX1250-NEXT:    s_wait_loadcnt 0xe
 ; GFX1250-NEXT:    v_pk_fma_bf16 v2, v2, v18, v34
-; GFX1250-NEXT:    s_wait_loadcnt 0xc
+; GFX1250-NEXT:    s_wait_loadcnt 0xd
 ; GFX1250-NEXT:    v_pk_fma_bf16 v3, v3, v19, v35
-; GFX1250-NEXT:    s_wait_loadcnt 0xb
+; GFX1250-NEXT:    s_wait_loadcnt 0xc
 ; GFX1250-NEXT:    v_pk_fma_bf16 v4, v4, v20, v36
-; GFX1250-NEXT:    s_wait_loadcnt 0xa
+; GFX1250-NEXT:    s_wait_loadcnt 0xb
 ; GFX1250-NEXT:    v_pk_fma_bf16 v5, v5, v21, v37
-; GFX1250-NEXT:    s_wait_loadcnt 0x9
+; GFX1250-NEXT:    s_wait_loadcnt 0xa
 ; GFX1250-NEXT:    v_pk_fma_bf16 v6, v6, v22, v38
-; GFX1250-NEXT:    s_wait_loadcnt 0x8
+; GFX1250-NEXT:    s_wait_loadcnt 0x9
 ; GFX1250-NEXT:    v_pk_fma_bf16 v7, v7, v23, v39
-; GFX1250-NEXT:    s_wait_loadcnt 0x7
+; GFX1250-NEXT:    s_wait_loadcnt 0x8
 ; GFX1250-NEXT:    v_pk_fma_bf16 v8, v8, v24, v48
-; GFX1250-NEXT:    s_wait_loadcnt 0x6
+; GFX1250-NEXT:    s_wait_loadcnt 0x7
 ; GFX1250-NEXT:    v_pk_fma_bf16 v9, v9, v25, v49
-; GFX1250-NEXT:    s_wait_loadcnt 0x5
+; GFX1250-NEXT:    s_wait_loadcnt 0x6
 ; GFX1250-NEXT:    v_pk_fma_bf16 v10, v10, v26, v50
-; GFX1250-NEXT:    s_wait_loadcnt 0x4
+; GFX1250-NEXT:    s_wait_loadcnt 0x5
 ; GFX1250-NEXT:    v_pk_fma_bf16 v11, v11, v27, v51
-; GFX1250-NEXT:    s_wait_loadcnt 0x3
+; GFX1250-NEXT:    s_wait_loadcnt 0x4
 ; GFX1250-NEXT:    v_pk_fma_bf16 v12, v12, v28, v52
-; GFX1250-NEXT:    s_wait_loadcnt 0x2
+; GFX1250-NEXT:    s_wait_loadcnt 0x3
 ; GFX1250-NEXT:    v_pk_fma_bf16 v13, v13, v29, v53
-; GFX1250-NEXT:    s_wait_loadcnt 0x1
+; GFX1250-NEXT:    s_wait_loadcnt 0x2
 ; GFX1250-NEXT:    v_pk_fma_bf16 v14, v14, v30, v54
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_pk_fma_bf16 v15, v15, v55, v31
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index 68313807c427f..6c25b36ed5e10 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -259,14 +259,14 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
 ; SDAG-GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SDAG-GFX942-NEXT:    s_add_i32 s1, s0, s16
 ; SDAG-GFX942-NEXT:    v_mov_b32_e32 v60, s1
+; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
 ; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
 ; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
 ; SDAG-GFX942-NEXT:    s_add_i32 s2, s8, s16
 ; SDAG-GFX942-NEXT:    v_mov_b32_e32 v0, s2
 ; SDAG-GFX942-NEXT:    s_addk_i32 s16, 0x100
 ; SDAG-GFX942-NEXT:    s_cmpk_lt_u32 s16, 0x2000
-; SDAG-GFX942-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-GFX942-NEXT:    s_waitcnt vmcnt(2)
 ; SDAG-GFX942-NEXT:    v_accvgpr_write_b32 a0, v15 ; Reload Reuse
 ; SDAG-GFX942-NEXT:    v_accvgpr_write_b32 a1, v14 ; Reload Reuse
 ; SDAG-GFX942-NEXT:    v_accvgpr_write_b32 a2, v13 ; Reload Reuse
@@ -285,8 +285,9 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
 ; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
 ; SDAG-GFX942-NEXT:    s_nop 0
 ; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT:    s_nop 0
+; SDAG-GFX942-NEXT:    s_waitcnt vmcnt(14)
 ; SDAG-GFX942-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
+; SDAG-GFX942-NEXT:    s_waitcnt vmcnt(14)
 ; SDAG-GFX942-NEXT:    buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
 ; SDAG-GFX942-NEXT:    s_nop 1
 ; SDAG-GFX942-NEXT:    v_accvgpr_read_b32 v5, a0 ; Reload Reuse
@@ -431,6 +432,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
 ; GISEL-GFX942-NEXT:  .LBB0_1: ; %load-store-loop
 ; GISEL-GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GISEL-GFX942-NEXT:    v_add_u32_e32 v62, s0, v1
+; GISEL-GFX942-NEXT:    buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
@@ -446,26 +448,40 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
-; GISEL-GFX942-NEXT:    buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
 ; GISEL-GFX942-NEXT:    v_add_u32_e32 v63, s12, v1
 ; GISEL-GFX942-NEXT:    v_add_u32_e32 v1, 0x100, v1
 ; GISEL-GFX942-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
 ; GISEL-GFX942-NEXT:    scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
 ; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(0)
@@ -785,14 +801,14 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
 ; SDAG-GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SDAG-GFX942-NEXT:    s_add_i32 s1, s0, s16
 ; SDAG-GFX942-NEXT:    v_mov_b32_e32 v60, s1
+; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
 ; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
 ; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
 ; SDAG-GFX942-NEXT:    s_add_i32 s2, s8, s16
 ; SDAG-GFX942-NEXT:    v_mov_b32_e32 v0, s2
 ; SDAG-GFX942-NEXT:    s_addk_i32 s16, 0x100
 ; SDAG-GFX942-NEXT:    s_cmpk_lt_u32 s16, 0x100
-; SDAG-GFX942-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-GFX942-NEXT:    s_waitcnt vmcnt(2)
 ; SDAG-GFX942-NEXT:    v_accvgpr_write_b32 a0, v15 ; Reload Reuse
 ; SDAG-GFX942-NEXT:    v_accvgpr_write_b32 a1, v14 ; Reload Reuse
 ; SDAG-GFX942-NEXT:    v_accvgpr_write_b32 a2, v13 ; Reload Reuse
@@ -811,8 +827,9 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
 ; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
 ; SDAG-GFX942-NEXT:    s_nop 0
 ; SDAG-GFX942-NEXT:    buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT:    s_nop 0
+; SDAG-GFX942-NEXT:    s_waitcnt vmcnt(14)
 ; SDAG-GFX942-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
+; SDAG-GFX942-NEXT:    s_waitcnt vmcnt(14)
 ; SDAG-GFX942-NEXT:    buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
 ; SDAG-GFX942-NEXT:    s_nop 1
 ; SDAG-GFX942-NEXT:    v_accvgpr_read_b32 v5, a0 ; Reload Reuse
@@ -957,6 +974,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
 ; GISEL-GFX942-NEXT:  .LBB1_1: ; %load-store-loop
 ; GISEL-GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GISEL-GFX942-NEXT:    v_add_u32_e32 v62, s0, v1
+; GISEL-GFX942-NEXT:    buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
@@ -972,26 +990,40 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
 ; GISEL-GFX942-NEXT:    buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
-; GISEL-GFX942-NEXT:    buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
 ; GISEL-GFX942-NEXT:    v_add_u32_e32 v63, s12, v1
 ; GISEL-GFX942-NEXT:    v_add_u32_e32 v1, 0x100, v1
 ; GISEL-GFX942-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(15)
 ; GISEL-GFX942-NEXT:    buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
 ; GISEL-GFX942-NEXT:    scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
 ; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 2b63a8cf69476..ec72ab930ddc0 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -726,9 +726,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0
 ; GCN-O0-NEXT:    s_mov_b32 s4, s2
@@ -738,9 +738,10 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, 1
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 offset:4
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s0, 4
@@ -772,9 +773,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s1, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    s_mov_b32 s2, s0
@@ -784,8 +785,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, 3
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 offset:12
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s0, 6
@@ -1312,15 +1314,12 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v7, 19
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-O0-NEXT:  ; %bb.11: ; %bb12
-; GCN-O0-NEXT:    s_waitcnt expcnt(3)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt expcnt(2)
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt expcnt(1)
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-O0-NEXT:    v_mov_b32_e32 v4, v3
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v5, s4
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 07e6a76d14cf9..d38dbd9ba1f93 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -909,27 +909,31 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
-; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
-; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
-; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
 ; SI-NEXT:    buffer_load_ubyte v4, off, s[8:11], 0 offset:5
 ; SI-NEXT:    buffer_load_ubyte v5, off, s[8:11], 0 offset:7
+; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
 ; SI-NEXT:    buffer_load_ubyte v6, off, s[8:11], 0 offset:4
 ; SI-NEXT:    buffer_load_ubyte v7, off, s[8:11], 0 offset:6
+; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v7
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -1055,8 +1059,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
 ; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
 ; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:7
@@ -1064,9 +1068,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-GISEL-NEXT:    v_or3_b32 v2, v2, v3, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 137acd34ecc2a..5ef2a5b9df344 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -874,27 +874,31 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s6
 ; SI-NEXT:    s_mov_b32 s9, s7
-; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
-; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
-; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
 ; SI-NEXT:    buffer_load_ubyte v4, off, s[8:11], 0 offset:5
 ; SI-NEXT:    buffer_load_ubyte v5, off, s[8:11], 0 offset:7
+; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
 ; SI-NEXT:    buffer_load_ubyte v6, off, s[8:11], 0 offset:4
 ; SI-NEXT:    buffer_load_ubyte v7, off, s[8:11], 0 offset:6
+; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v7
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -1020,8 +1024,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
 ; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
 ; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:7
@@ -1029,9 +1033,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-GISEL-NEXT:    v_or3_b32 v2, v2, v3, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 14897b68bf57b..70fc83117f5a2 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1908,27 +1908,29 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v12, v[2:3]
-; VI-NEXT:    flat_load_ubyte v2, v[8:9]
-; VI-NEXT:    flat_load_ubyte v3, v[10:11]
+; VI-NEXT:    flat_load_ubyte v6, v[6:7]
 ; VI-NEXT:    flat_load_ubyte v4, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v5, v[0:1]
-; VI-NEXT:    flat_load_ubyte v6, v[6:7]
+; VI-NEXT:    flat_load_ubyte v12, v[2:3]
+; VI-NEXT:    flat_load_ubyte v3, v[10:11]
+; VI-NEXT:    flat_load_ubyte v2, v[8:9]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v7, v[0:1]
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v6
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v6
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v12
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
@@ -1942,20 +1944,20 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
 ; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
 ; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
 ; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
 ; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
 ; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v7
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
@@ -2003,20 +2005,20 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:6
 ; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
 ; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
 ; GFX11-NEXT:    global_load_u8 v5, v0, s[2:3] offset:1
+; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:6
 ; GFX11-NEXT:    global_load_d16_b16 v7, v0, s[2:3] offset:4
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v5, v7
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index df7f8c6f39b3f..9f617a914e81a 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -548,47 +548,52 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 4
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_9
 ; GFX9-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[9:10], s4, v[9:10]
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX9-O0-NEXT:    v_or3_b32 v4, v4, v11, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -608,26 +613,29 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 8
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 9
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
@@ -638,35 +646,36 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 10
 ; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 11
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[22:23], s5, v[22:23]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v23
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
@@ -676,6 +685,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v4
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[2:3]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s4, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v29
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
@@ -685,25 +695,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v3, v4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s5, v[0:1]
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[6:7]
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
 ; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v28
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v26
 ; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
 ; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v24
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -712,12 +725,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -735,13 +750,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
 ; GFX9-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_and_b32_e64 v20, v11, v20
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_and_b32_e64 v22, v11, v18
 ; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v7
@@ -757,12 +776,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v17
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v19, vcc, v11, v16
@@ -850,29 +872,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v4
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
 ; GFX9-O0-NEXT:    v_or_b32_e64 v5, v5, v22
@@ -910,12 +933,16 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s7, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s6, s9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s7
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
@@ -951,10 +978,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
@@ -962,39 +990,43 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -1016,10 +1048,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v2, s4, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v13, s4, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
 ; GFX9-O0-NEXT:    v_or_b32_e64 v12, v12, v15
@@ -1098,28 +1132,35 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX9-O0-NEXT:    s_branch .LBB0_7
 ; GFX9-O0-NEXT:  .LBB0_9: ; %udiv-end
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v3, v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v8, v5, v4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v3, v3, v6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v0, v0, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
@@ -1661,18 +1702,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB0_9
 ; GFX9-G-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
-; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, 1
@@ -1687,8 +1728,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_lshrrev_b32_e64 v5, v2, v3
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v12
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v13
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v14
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v1
@@ -1722,26 +1765,29 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s4, v31, 4
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s5, v31, 5
 ; GFX9-G-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_nop 0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-G-O0-NEXT:  .LBB0_6: ; %udiv-do-while
@@ -1752,35 +1798,35 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s6, v31, 6
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s7, v31, 7
-; GFX9-G-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v5
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v4
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 1
@@ -1797,6 +1843,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-G-O0-NEXT:    v_or_b32_e64 v7, v0, v1
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr22_vgpr23 killed $exec
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v24
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v25
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr0 killed $exec
@@ -1822,10 +1869,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_lshrrev_b32_e64 v15, v2, v3
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v27
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v28
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v23, v29
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v24, v30
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v1
@@ -1845,10 +1892,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v14
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v15
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v13, s[8:9], v13, v4
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v12, s[8:9], v12, v9, s[8:9]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v10, v7, s[8:9]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v12, s[8:9], v6, v5, s[8:9]
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 31
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s8
@@ -1867,9 +1917,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v22
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v23
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    v_and_b32_e64 v11, v10, v11
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-G-O0-NEXT:    v_and_b32_e64 v10, v10, v21
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-G-O0-NEXT:    v_and_b32_e64 v8, v6, v8
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_and_b32_e64 v6, v6, v20
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v4, v11
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9]
@@ -1879,9 +1933,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v10
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v9
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v16
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v17
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v18
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v19
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, -1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s12, -1
@@ -1961,32 +2019,34 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-G-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v22, v17
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v23, v16
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v4
 ; GFX9-G-O0-NEXT:    s_mov_b32 s5, 0xffffffc0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-G-O0-NEXT:    v_add_u32_e64 v4, v18, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX9-G-O0-NEXT:    v_sub_u32_e64 v5, v5, v18
@@ -1995,7 +2055,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v18, v6
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX9-G-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v18, v6
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[6:7], v18, v[20:21]
 ; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[25:26], v18, v[22:23]
 ; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[23:24], v5, v[20:21]
@@ -2031,15 +2090,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_mov_b32 s7, -1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, -1
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v16, s4
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-G-O0-NEXT:    v_add_co_u32_e64 v15, s[4:5], v15, v16
 ; GFX9-G-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, s10
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5]
 ; GFX9-G-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, s7
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5]
 ; GFX9-G-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, s6
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v12, s[4:5], v12, v13, s[4:5]
 ; GFX9-G-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], s[8:9]
@@ -2058,49 +2121,57 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_nop 0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_nop 0
 ; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_nop 0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-G-O0-NEXT:  .LBB0_8: ; %udiv-bb1
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-G-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s9, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    v_add_co_u32_e64 v4, s[6:7], v2, v4
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s10
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v5, s[6:7], v5, v7, s[6:7]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s8
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v6, s[6:7], v1, v6, s[6:7]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v5
@@ -2116,8 +2187,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v8, s[6:7], v1, v2
 ; GFX9-G-O0-NEXT:    s_mov_b32 s7, 64
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v9
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v3
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0xffffffc0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s6
@@ -2198,27 +2272,31 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX9-G-O0-NEXT:    s_branch .LBB0_7
 ; GFX9-G-O0-NEXT:  .LBB0_9: ; %udiv-end
-; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v9
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v10
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v11
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v0, v0, v7
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v1, v1, v6
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v9
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v2, v2, v5
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v3, v3, v4
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v0, s[4:5], v0, v7
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v1, s[4:5], v1, v6, s[4:5]
@@ -2671,47 +2749,52 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 3
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_9
 ; GFX9-O0-NEXT:  .LBB1_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[9:10], s4, v[9:10]
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX9-O0-NEXT:    v_or3_b32 v4, v4, v11, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -2731,26 +2814,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 6
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_4
 ; GFX9-O0-NEXT:  .LBB1_6: ; %udiv-do-while
@@ -2761,35 +2847,36 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 8
 ; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 9
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[22:23], s5, v[22:23]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v23
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
@@ -2799,6 +2886,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v4
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[2:3]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s4, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v29
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
@@ -2808,25 +2896,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v3, v4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s5, v[0:1]
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[6:7]
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
 ; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v28
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v26
 ; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
 ; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v24
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -2835,12 +2926,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -2858,13 +2951,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
 ; GFX9-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_and_b32_e64 v20, v11, v20
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_and_b32_e64 v22, v11, v18
 ; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v7
@@ -2880,12 +2977,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v17
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v19, vcc, v11, v16
@@ -2973,29 +3073,30 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v4
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
 ; GFX9-O0-NEXT:    v_or_b32_e64 v5, v5, v22
@@ -3033,12 +3134,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s7, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s6, s9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s7
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
@@ -3074,10 +3179,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
@@ -3085,39 +3191,43 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_6
 ; GFX9-O0-NEXT:  .LBB1_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -3139,10 +3249,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v2, s4, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v13, s4, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
 ; GFX9-O0-NEXT:    v_or_b32_e64 v12, v12, v15
@@ -3221,14 +3333,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_5
 ; GFX9-O0-NEXT:    s_branch .LBB1_7
 ; GFX9-O0-NEXT:  .LBB1_9: ; %udiv-end
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 32
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[2:3], s4, v[4:5]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
@@ -3691,18 +3804,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB1_9
 ; GFX9-G-O0-NEXT:  .LBB1_4: ; %udiv-loop-exit
-; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, 1
@@ -3717,8 +3830,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_lshrrev_b32_e64 v5, v2, v3
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v12
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v13
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v14
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v1
@@ -3752,26 +3867,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s4, v32, 4
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s5, v32, 5
 ; GFX9-G-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_nop 0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB1_4
 ; GFX9-G-O0-NEXT:  .LBB1_6: ; %udiv-do-while
@@ -3782,35 +3900,35 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s6, v32, 6
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s7, v32, 7
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v5
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v20, v4
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 1
@@ -3827,6 +3945,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-G-O0-NEXT:    v_or_b32_e64 v7, v0, v1
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v14
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v15
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr0 killed $exec
@@ -3852,10 +3971,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_lshrrev_b32_e64 v13, v2, v3
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v28
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v29
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v20, v30
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v31
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v1
@@ -3875,10 +3994,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v13
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v11, s[8:9], v11, v4
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9]
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 31
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s8
@@ -3897,8 +4019,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v11
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v10
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v22, v24
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v23, v25
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v20, v26
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v27
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v22
@@ -3917,9 +4041,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v10
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v9
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v16
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v17
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v18
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v19
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, -1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s12, -1
@@ -3999,32 +4127,33 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-G-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v5
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v4
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v7
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v20, v6
 ; GFX9-G-O0-NEXT:    s_mov_b32 s5, 0xffffffc0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-G-O0-NEXT:    v_add_u32_e64 v4, v12, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX9-G-O0-NEXT:    v_sub_u32_e64 v5, v5, v12
@@ -4065,9 +4194,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v12
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v13
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v16
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v17
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v18
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v19
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, -1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, -1
@@ -4101,49 +4234,57 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_nop 0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_nop 0
 ; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_nop 0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB1_6
 ; GFX9-G-O0-NEXT:  .LBB1_8: ; %udiv-bb1
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-G-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s9, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    v_add_co_u32_e64 v4, s[6:7], v1, v4
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s10
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v7, s[6:7], v2, v3, s[6:7]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v6, s[6:7], v0, v2, s[6:7]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v5
@@ -4158,6 +4299,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v3, s[6:7], v0, v1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s7, 64
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0xffffffc0
@@ -4172,6 +4314,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v3, v0
 ; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v3, v[12:13]
 ; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[17:18], v8, v[12:13]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[15:16], v3, v[10:11]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v17
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v18
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index 5dcf5d437bae6..d85e31cfee807 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -237,21 +237,24 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:3
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:4
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:5
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:3
@@ -449,30 +452,38 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
@@ -596,17 +607,18 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:8
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:2
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:8
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:2
@@ -799,37 +811,42 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v12, v0 offset:11
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
@@ -982,22 +999,26 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:12
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:12
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:8
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v8, v0 offset:10
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:12
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v7 offset:8
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:2
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v6 offset:6
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v8 offset:10
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9f1b55ea3b1ef..260d98b6371c9 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -530,19 +530,20 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
 ; CI-NEXT:    ds_read_u8 v2, v1 offset:1
-; CI-NEXT:    ds_read_u8 v3, v1 offset:34
-; CI-NEXT:    ds_read_u8 v4, v1 offset:32
-; CI-NEXT:    ds_read_u8 v5, v1 offset:2
 ; CI-NEXT:    ds_read_u8 v6, v1
 ; CI-NEXT:    ds_read_u8 v7, v1 offset:3
+; CI-NEXT:    ds_read_u8 v5, v1 offset:2
 ; CI-NEXT:    ds_read_u8 v8, v1 offset:33
+; CI-NEXT:    ds_read_u8 v4, v1 offset:32
+; CI-NEXT:    ds_read_u8 v3, v1 offset:34
 ; CI-NEXT:    ds_read_u8 v1, v1 offset:35
 ; CI-NEXT:    s_waitcnt lgkmcnt(7)
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT:    s_waitcnt lgkmcnt(3)
+; CI-NEXT:    s_waitcnt lgkmcnt(6)
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v6
-; CI-NEXT:    s_waitcnt lgkmcnt(2)
+; CI-NEXT:    s_waitcnt lgkmcnt(5)
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 8, v7
+; CI-NEXT:    s_waitcnt lgkmcnt(4)
 ; CI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -623,19 +624,20 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
 ; CI-NEXT:    ds_read_u8 v2, v1 offset:6
-; CI-NEXT:    ds_read_u8 v3, v1 offset:11
-; CI-NEXT:    ds_read_u8 v4, v1 offset:9
-; CI-NEXT:    ds_read_u8 v5, v1 offset:7
 ; CI-NEXT:    ds_read_u8 v6, v1 offset:5
 ; CI-NEXT:    ds_read_u8 v7, v1 offset:8
+; CI-NEXT:    ds_read_u8 v5, v1 offset:7
 ; CI-NEXT:    ds_read_u8 v8, v1 offset:10
+; CI-NEXT:    ds_read_u8 v4, v1 offset:9
+; CI-NEXT:    ds_read_u8 v3, v1 offset:11
 ; CI-NEXT:    ds_read_u8 v1, v1 offset:12
 ; CI-NEXT:    s_waitcnt lgkmcnt(7)
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT:    s_waitcnt lgkmcnt(3)
+; CI-NEXT:    s_waitcnt lgkmcnt(6)
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v6
-; CI-NEXT:    s_waitcnt lgkmcnt(2)
+; CI-NEXT:    s_waitcnt lgkmcnt(5)
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 8, v7
+; CI-NEXT:    s_waitcnt lgkmcnt(4)
 ; CI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -716,8 +718,8 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
 ; CI-NEXT:    ds_read_u16 v2, v1 offset:2
-; CI-NEXT:    ds_read_u16 v3, v1 offset:32
 ; CI-NEXT:    ds_read_u16 v4, v1
+; CI-NEXT:    ds_read_u16 v3, v1 offset:32
 ; CI-NEXT:    ds_read_u16 v1, v1 offset:34
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1454,21 +1456,21 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out)
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    ds_read_u8 v1, v0 offset:70
+; CI-NEXT:    ds_read_u8 v4, v0 offset:69
 ; CI-NEXT:    ds_read_u8 v2, v0 offset:72
 ; CI-NEXT:    ds_read_u8 v3, v0 offset:71
-; CI-NEXT:    ds_read_u8 v4, v0 offset:69
 ; CI-NEXT:    ds_read_u8 v5, v0 offset:68
 ; CI-NEXT:    s_waitcnt lgkmcnt(4)
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; CI-NEXT:    s_waitcnt lgkmcnt(3)
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT:    s_waitcnt lgkmcnt(2)
-; CI-NEXT:    v_or_b32_e32 v2, v2, v3
-; CI-NEXT:    s_waitcnt lgkmcnt(1)
 ; CI-NEXT:    v_or_b32_e32 v1, v1, v4
 ; CI-NEXT:    ds_read_u8 v4, v0 offset:66
 ; CI-NEXT:    ds_read_u8 v6, v0 offset:67
 ; CI-NEXT:    ds_read_u8 v0, v0 offset:65
+; CI-NEXT:    s_waitcnt lgkmcnt(5)
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; CI-NEXT:    s_waitcnt lgkmcnt(4)
+; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -1487,14 +1489,14 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out)
 ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
 ; GFX9-ALIGNED:       ; %bb.0: ; %entry
 ; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v0, v2 offset:65
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v2 offset:66
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v2 offset:67
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v2 offset:68
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v2 offset:70
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v2 offset:69
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v2 offset:72
 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v2 offset:71
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v2 offset:66
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v0, v2 offset:65
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v2 offset:68
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v2 offset:67
 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 52bcaed7ec75a..f55de32728561 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -154,15 +154,15 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v6, vcc, 3, v0
 ; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX7-ALIGNED-NEXT:    flat_load_ubyte v4, v[4:5]
-; GFX7-ALIGNED-NEXT:    flat_load_ubyte v5, v[6:7]
 ; GFX7-ALIGNED-NEXT:    flat_load_ubyte v2, v[2:3]
+; GFX7-ALIGNED-NEXT:    flat_load_ubyte v5, v[6:7]
 ; GFX7-ALIGNED-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v4
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 6f8da57e223e5..44493498bdbe9 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -205,15 +205,15 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 3, v0
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v3, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v3, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 308e86bbaf8fd..1efa0eeb94b87 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -2890,19 +2890,21 @@ define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64
-; GFX6-GISEL-NEXT:    buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
 ; GFX6-GISEL-NEXT:    buffer_store_dword v22, v[2:3], s[4:7], 0 addr64 offset:72
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4316,28 +4318,30 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX6-GISEL-NEXT:    buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
-; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112
 ; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(6)
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
 ; GFX6-GISEL-NEXT:    buffer_store_dword v34, v[2:3], s[4:7], 0 addr64 offset:120
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -10380,18 +10384,18 @@ define void @freeze_v8p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
 ; GFX6-SDAG-NEXT:    s_mov_b32 m0, -1
-; GFX6-SDAG-NEXT:    ds_read_b64 v[2:3], v2
-; GFX6-SDAG-NEXT:    ds_read_b64 v[4:5], v4
 ; GFX6-SDAG-NEXT:    ds_read_b64 v[6:7], v0
+; GFX6-SDAG-NEXT:    ds_read_b64 v[4:5], v4
+; GFX6-SDAG-NEXT:    ds_read_b64 v[2:3], v2
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
 ; GFX6-SDAG-NEXT:    ds_read_b64 v[8:9], v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 16, v1
 ; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[4:5]
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 24, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[2:3]
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v1
-; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v1, v[6:7]
 ; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[8:9]
@@ -10538,31 +10542,35 @@ define void @freeze_v16p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v16, vcc, 56, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 48, v0
 ; GFX6-SDAG-NEXT:    s_mov_b32 m0, -1
-; GFX6-SDAG-NEXT:    ds_read_b64 v[2:3], v2
-; GFX6-SDAG-NEXT:    ds_read_b64 v[4:5], v4
-; GFX6-SDAG-NEXT:    ds_read_b64 v[6:7], v6
 ; GFX6-SDAG-NEXT:    ds_read_b64 v[8:9], v0
-; GFX6-SDAG-NEXT:    ds_read_b64 v[10:11], v10
-; GFX6-SDAG-NEXT:    ds_read_b64 v[12:13], v12
 ; GFX6-SDAG-NEXT:    ds_read_b64 v[14:15], v14
 ; GFX6-SDAG-NEXT:    ds_read_b64 v[16:17], v16
+; GFX6-SDAG-NEXT:    ds_read_b64 v[12:13], v12
+; GFX6-SDAG-NEXT:    ds_read_b64 v[10:11], v10
+; GFX6-SDAG-NEXT:    ds_read_b64 v[6:7], v6
+; GFX6-SDAG-NEXT:    ds_read_b64 v[4:5], v4
+; GFX6-SDAG-NEXT:    ds_read_b64 v[2:3], v2
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 48, v1
-; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX6-SDAG-NEXT:    ds_write_b64 v1, v[8:9]
-; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[14:15]
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 56, v1
-; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[16:17]
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 32, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[12:13]
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 40, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[10:11]
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 16, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[6:7]
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 24, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[4:5]
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v1
+; GFX6-SDAG-NEXT:    ds_write_b64 v1, v[8:9]
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[2:3]
 ; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -10934,14 +10942,15 @@ define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-SDAG:       ; %bb.0:
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
-; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
 ; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 4, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 8, v1
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 offen
@@ -10971,14 +10980,15 @@ define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
-; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
 ; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 4, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 8, v1
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 offen
@@ -11072,17 +11082,19 @@ define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
-; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
 ; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 4, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 12, v1
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v7, s[0:3], 0 offen
@@ -11118,17 +11130,19 @@ define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
-; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
 ; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 4, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 12, v1
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v0, v7, s[0:3], 0 offen
@@ -11242,13 +11256,13 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 4, v0
-; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 28, v0
 ; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v9, vcc, 4, v1
@@ -11258,13 +11272,19 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v13, vcc, 20, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 24, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v15, vcc, 28, v1
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v5, v11, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v4, v12, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v3, v13, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v2, v14, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v15, s[0:3], 0 offen
@@ -11324,13 +11344,13 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 4, v0
-; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 28, v0
 ; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v9, vcc, 4, v1
@@ -11340,13 +11360,19 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v13, vcc, 20, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 24, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v15, vcc, 28, v1
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v5, v11, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v4, v12, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v3, v13, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v2, v14, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v0, v15, s[0:3], 0 offen
@@ -11538,10 +11564,10 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
-; GFX6-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
@@ -11552,53 +11578,54 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v13, vcc, 28, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v15, vcc, 20, v0
-; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v16, v0, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v15, v15, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v14, v14, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v13, v13, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v17, vcc, 4, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v18, vcc, 8, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v19, vcc, 12, v1
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
-; GFX6-SDAG-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
-; GFX6-SDAG-NEXT:    buffer_store_dword v7, v18, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v8, v17, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 16, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(13)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 40, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v17, vcc, 20, v1
+; GFX6-SDAG-NEXT:    buffer_store_dword v7, v18, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 24, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v18, vcc, 28, v1
+; GFX6-SDAG-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 32, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v19, vcc, 36, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 44, v1
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(9)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v14, v7, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(9)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v13, v18, s[0:3], 0 offen
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v12, v6, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 48, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 52, v1
@@ -11606,8 +11633,8 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 56, v1
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 60, v1
-; GFX6-SDAG-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -11629,8 +11656,8 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v12, vcc, 40, v0
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v13, vcc, 44, v0
 ; GFX6-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX6-GISEL-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
@@ -11660,35 +11687,34 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v18, vcc, 28, v1
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-GISEL-NEXT:    buffer_store_dword v4, v19, s[0:3], 0 offen
-; GFX6-GISEL-NEXT:    s_waitcnt expcnt(0)
-; GFX6-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 32, v1
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-GISEL-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 40, v1
+; GFX6-GISEL-NEXT:    s_waitcnt expcnt(1)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 32, v1
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v19, vcc, 36, v1
 ; GFX6-GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 44, v1
-; GFX6-GISEL-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_store_dword v6, v17, s[0:3], 0 offen
-; GFX6-GISEL-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_store_dword v8, v18, s[0:3], 0 offen
-; GFX6-GISEL-NEXT:    buffer_store_dword v10, v4, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v10, v4, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
-; GFX6-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_store_dword v13, v5, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 48, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-GISEL-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 52, v1
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX6-GISEL-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 56, v1
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 60, v1
-; GFX6-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -11700,10 +11726,10 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
-; GFX7-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
@@ -11714,58 +11740,57 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v13, vcc, 28, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v15, vcc, 20, v0
-; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v16, v0, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v15, v15, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v14, v14, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v13, v13, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v17, vcc, 4, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v18, vcc, 8, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v19, vcc, 12, v1
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-SDAG-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-SDAG-NEXT:    buffer_store_dword v7, v18, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v8, v17, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 16, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(13)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 40, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v17, vcc, 20, v1
+; GFX7-SDAG-NEXT:    buffer_store_dword v7, v18, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 24, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v18, vcc, 28, v1
+; GFX7-SDAG-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 32, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v19, vcc, 36, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 44, v1
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(9)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v14, v7, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(9)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v13, v18, s[0:3], 0 offen
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v12, v6, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 48, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-SDAG-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 52, v1
 ; GFX7-SDAG-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 56, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 60, v1
-; GFX7-SDAG-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -11787,8 +11812,8 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v12, vcc, 40, v0
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v13, vcc, 44, v0
 ; GFX7-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX7-GISEL-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
@@ -11816,33 +11841,32 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v18, vcc, 28, v1
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-GISEL-NEXT:    buffer_store_dword v4, v19, s[0:3], 0 offen
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 32, v1
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-GISEL-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 40, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 32, v1
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v19, vcc, 36, v1
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 44, v1
-; GFX7-GISEL-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_store_dword v6, v17, s[0:3], 0 offen
-; GFX7-GISEL-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_store_dword v8, v18, s[0:3], 0 offen
-; GFX7-GISEL-NEXT:    buffer_store_dword v10, v4, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v10, v4, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
-; GFX7-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_store_dword v13, v5, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 48, v1
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-GISEL-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 52, v1
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-GISEL-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 56, v1
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 60, v1
-; GFX7-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -11864,8 +11888,8 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 40, v0
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v13, vcc, 44, v0
 ; GFX8-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
-; GFX8-GISEL-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
@@ -11893,33 +11917,32 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v18, vcc, 28, v1
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX8-GISEL-NEXT:    buffer_store_dword v4, v19, s[0:3], 0 offen
-; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v1
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX8-GISEL-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 40, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v19, vcc, 36, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v5, vcc, 44, v1
-; GFX8-GISEL-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_store_dword v6, v17, s[0:3], 0 offen
-; GFX8-GISEL-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_store_dword v8, v18, s[0:3], 0 offen
-; GFX8-GISEL-NEXT:    buffer_store_dword v10, v4, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v10, v4, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
-; GFX8-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_store_dword v13, v5, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX8-GISEL-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 52, v1
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
 ; GFX8-GISEL-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 56, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 60, v1
-; GFX8-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 5babe9fb3d851..d86894fb335c7 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -2631,13 +2631,13 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s32
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2835,9 +2835,9 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; CIGFX89:       ; %bb.0:
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CIGFX89-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:4
 ; CIGFX89-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
 ; CIGFX89-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; CIGFX89-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:4
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(3)
@@ -2868,8 +2868,8 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:8
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -2890,7 +2890,7 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3241,11 +3241,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX89-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
-; GFX89-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
 ; GFX89-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:4
 ; GFX89-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
 ; GFX89-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:12
+; GFX89-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GFX89-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt vmcnt(5)
@@ -3280,10 +3280,10 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
 ; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:16
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -3304,13 +3304,13 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b32 v35, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    buffer_store_b32 v36, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3330,13 +3330,13 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; CI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:16
 ; CI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:8
 ; CI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt vmcnt(7)
@@ -3367,13 +3367,13 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:16
 ; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt vmcnt(7)
@@ -3404,13 +3404,13 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
@@ -3443,13 +3443,13 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x8
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:24
 ; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:16
 ; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:24
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:20
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -3470,7 +3470,7 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3573,13 +3573,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
 ; CI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
 ; CI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
 ; CI-NEXT:    s_waitcnt vmcnt(7)
 ; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3592,13 +3592,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
-; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
-; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
-; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
 ; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
 ; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
 ; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
 ; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
 ; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3622,13 +3622,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
 ; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
 ; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3641,13 +3641,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
 ; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
 ; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
 ; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3671,13 +3671,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -3690,13 +3690,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
 ; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
 ; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
 ; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -3719,26 +3719,26 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x10
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:32
 ; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:28
 ; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:24
 ; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:56
 ; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:40
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:36
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -3755,13 +3755,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-NEXT:    buffer_store_b128 v[52:55], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    buffer_store_b128 v[48:51], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3812,25 +3812,25 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:128
+; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
+; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:120
 ; CI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:112
 ; CI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108
 ; CI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:104
 ; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:100
-; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:128
-; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
-; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:120
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:116
-; CI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80
-; CI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
-; CI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:72
 ; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:96
 ; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:92
 ; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88
 ; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:84
+; CI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80
+; CI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:72
 ; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
 ; CI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3885,25 +3885,25 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:128
+; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:120
 ; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:112
 ; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108
 ; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:104
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:128
-; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:120
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:72
 ; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:96
 ; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:92
 ; VI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88
 ; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:84
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:72
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3959,26 +3959,26 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:120
 ; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:112
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108
 ; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:104
 ; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:120
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[32:35], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:72
 ; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:96
 ; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:92
 ; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88
 ; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:72
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[36:39], off, s[4:7], 0
@@ -4001,42 +4001,42 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:28
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:40
 ; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:64
 ; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:56
 ; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:40
 ; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:24
 ; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:120
 ; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:104
 ; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:88
 ; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:72
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:68
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -4053,25 +4053,25 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-NEXT:    buffer_store_b128 v[84:87], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    s_waitcnt vmcnt(24)
 ; GFX11-NEXT:    buffer_store_b128 v[80:83], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    s_waitcnt vmcnt(20)
 ; GFX11-NEXT:    buffer_store_b128 v[68:71], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-NEXT:    buffer_store_b128 v[64:67], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
 ; GFX11-NEXT:    buffer_store_b128 v[52:55], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    buffer_store_b128 v[48:51], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -4280,13 +4280,13 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
-; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
 ; CI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
-; CI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
 ; CI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:56
-; CI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; CI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
 ; CI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
+; CI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
 ; CI-NEXT:    s_waitcnt vmcnt(7)
 ; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -4353,13 +4353,13 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    buffer_load_ubyte v32, off, s[0:3], s32 offset:48
-; VI-NEXT:    buffer_load_ubyte v33, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ubyte v34, off, s[0:3], s32 offset:64
-; VI-NEXT:    buffer_load_ubyte v35, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ubyte v33, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ubyte v36, off, s[0:3], s32 offset:56
-; VI-NEXT:    buffer_load_ubyte v37, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ubyte v35, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ubyte v32, off, s[0:3], s32 offset:48
 ; VI-NEXT:    buffer_load_ubyte v38, off, s[0:3], s32 offset:40
+; VI-NEXT:    buffer_load_ubyte v37, off, s[0:3], s32 offset:36
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -4426,13 +4426,13 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    buffer_load_ubyte v32, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_ubyte v33, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ubyte v34, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_ubyte v35, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ubyte v33, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ubyte v36, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_ubyte v37, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ubyte v35, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ubyte v32, off, s[0:3], s32 offset:48
 ; GFX9-NEXT:    buffer_load_ubyte v38, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_ubyte v37, off, s[0:3], s32 offset:36
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index b750d28ffa7d3..1054c7b792eb8 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -2395,27 +2395,27 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:72
-; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
-; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
-; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:84
-; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:88
 ; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:120
 ; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:116
 ; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:112
 ; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:108
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:128
-; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:132
-; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:136
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:140
-; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:144
-; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:148
-; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:152
 ; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:156
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:152
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:148
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:144
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:140
+; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:136
+; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:132
+; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:128
 ; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:104
 ; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:100
 ; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:96
@@ -2426,14 +2426,14 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:76
 ; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:72
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:96
-; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:104
-; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:108
-; GFX10-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:112
-; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
-; GFX10-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:120
 ; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:96
 ; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:68
 ; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:64
 ; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:60
@@ -2443,14 +2443,14 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:44
 ; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:40
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
-; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
-; GFX10-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
-; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
-; GFX10-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
-; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
-; GFX10-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
 ; GFX10-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
 ; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:36
 ; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:32
 ; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:28
@@ -2533,24 +2533,24 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:168
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:164
 ; GFX11-NEXT:    s_clause 0x11
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:72
 ; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:96
 ; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:92
 ; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:8
 ; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:80
 ; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    scratch_load_b32 v23, off, s32 offset:112
@@ -2735,6 +2735,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0
 ; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:516
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:636
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:640
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:644
@@ -2775,8 +2776,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:788
 ; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:792
 ; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:796
-; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:516
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(40)
 ; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -3009,6 +3009,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    v_writelane_b32 v63, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
 ; GFX10-NEXT:    s_clause 0x28
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:516
 ; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:636
 ; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:640
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:644
@@ -3049,8 +3050,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:788
 ; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:792
 ; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:796
-; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:516
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(40)
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:520
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -3262,6 +3262,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v32, v48
 ; GFX11-NEXT:    s_clause 0x9
+; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:512
 ; GFX11-NEXT:    scratch_load_b128 v[48:51], off, s33 offset:656
 ; GFX11-NEXT:    scratch_load_b128 v[52:55], off, s33 offset:672
 ; GFX11-NEXT:    scratch_load_b128 v[37:40], off, s33 offset:688
@@ -3271,22 +3272,22 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:752
 ; GFX11-NEXT:    scratch_load_b128 v[4:7], off, s33 offset:768
 ; GFX11-NEXT:    scratch_load_b128 v[8:11], off, s33 offset:784
-; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:512
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:1584 ; 16-byte Folded Spill
 ; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:576
 ; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:528
 ; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
 ; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:560
-; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:576
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_mov_b32_e32 v10, v21
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill
 ; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:592
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_mov_b32_e32 v10, v21
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill
 ; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:608
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index f80d50b56f550..b183deff48fdb 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -648,15 +648,15 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc0c0100
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s0
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
 ; GFX9-DL-NEXT:    global_store_byte v0, v1, s[6:7]
@@ -670,14 +670,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0100
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index f5d7bb3a45fe1..cf1997204704b 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1603,7 +1603,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v31, 6
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 7
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -1620,12 +1620,13 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
 ; NOOPT-NEXT:    buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v16
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v16
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_add_i32 m0, s2, 0xfffffe00
+; NOOPT-NEXT:    s_waitcnt vmcnt(1)
 ; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill
@@ -4119,6 +4120,7 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v31, 6
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 7
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -4143,12 +4145,12 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_add_i32 m0, s2, 0xfffffe00
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill
@@ -4208,41 +4210,52 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
 ; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
 ; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
-; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v18, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v22, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v26, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(12)
+; NOOPT-NEXT:    buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
 ; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
+; NOOPT-NEXT:    s_waitcnt vmcnt(13)
 ; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
+; NOOPT-NEXT:    s_waitcnt vmcnt(12)
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
-; NOOPT-NEXT:    s_waitcnt vmcnt(8)
+; NOOPT-NEXT:    s_waitcnt vmcnt(11)
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v22
+; NOOPT-NEXT:    s_waitcnt vmcnt(10)
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v21
+; NOOPT-NEXT:    s_waitcnt vmcnt(9)
 ; NOOPT-NEXT:    v_mov_b32_e32 v3, v20
+; NOOPT-NEXT:    s_waitcnt vmcnt(8)
 ; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
-; NOOPT-NEXT:    s_waitcnt vmcnt(4)
+; NOOPT-NEXT:    s_waitcnt vmcnt(7)
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
+; NOOPT-NEXT:    s_waitcnt vmcnt(6)
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
+; NOOPT-NEXT:    s_waitcnt vmcnt(5)
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
+; NOOPT-NEXT:    s_waitcnt vmcnt(4)
 ; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(3)
 ; NOOPT-NEXT:    v_mov_b32_e32 v9, v30
+; NOOPT-NEXT:    s_waitcnt vmcnt(2)
 ; NOOPT-NEXT:    v_mov_b32_e32 v10, v29
+; NOOPT-NEXT:    s_waitcnt vmcnt(1)
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v28
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
 ; NOOPT-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
 ; NOOPT-NEXT:    v_mov_b32_e32 v16, v11
@@ -4592,6 +4605,7 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v31, 6
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 7
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -4616,12 +4630,12 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_add_i32 m0, s2, -16
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill
@@ -4681,41 +4695,52 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
 ; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
 ; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
-; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v18, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v22, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v26, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(12)
+; NOOPT-NEXT:    buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
 ; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
+; NOOPT-NEXT:    s_waitcnt vmcnt(13)
 ; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
+; NOOPT-NEXT:    s_waitcnt vmcnt(12)
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
-; NOOPT-NEXT:    s_waitcnt vmcnt(8)
+; NOOPT-NEXT:    s_waitcnt vmcnt(11)
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v22
+; NOOPT-NEXT:    s_waitcnt vmcnt(10)
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v21
+; NOOPT-NEXT:    s_waitcnt vmcnt(9)
 ; NOOPT-NEXT:    v_mov_b32_e32 v3, v20
+; NOOPT-NEXT:    s_waitcnt vmcnt(8)
 ; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
-; NOOPT-NEXT:    s_waitcnt vmcnt(4)
+; NOOPT-NEXT:    s_waitcnt vmcnt(7)
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
+; NOOPT-NEXT:    s_waitcnt vmcnt(6)
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
+; NOOPT-NEXT:    s_waitcnt vmcnt(5)
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
+; NOOPT-NEXT:    s_waitcnt vmcnt(4)
 ; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(3)
 ; NOOPT-NEXT:    v_mov_b32_e32 v9, v30
+; NOOPT-NEXT:    s_waitcnt vmcnt(2)
 ; NOOPT-NEXT:    v_mov_b32_e32 v10, v29
+; NOOPT-NEXT:    s_waitcnt vmcnt(1)
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v28
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
 ; NOOPT-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
 ; NOOPT-NEXT:    v_mov_b32_e32 v16, v11
@@ -5132,7 +5157,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v18, 23
 ; NOOPT-NEXT:    v_readlane_b32 s1, v18, 24
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v16, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload
@@ -5149,12 +5174,13 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
 ; NOOPT-NEXT:    buffer_load_dword v13, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v14, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v16, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v16
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v16
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_mov_b32 m0, s2
+; NOOPT-NEXT:    s_waitcnt vmcnt(1)
 ; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:84 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:80 ; 4-byte Folded Spill
@@ -5249,7 +5275,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v18, 28
 ; NOOPT-NEXT:    v_readlane_b32 s1, v18, 29
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v16, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload
@@ -5266,12 +5292,13 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
 ; NOOPT-NEXT:    buffer_load_dword v13, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v14, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v16, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v16
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v16
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_mov_b32 m0, s2
+; NOOPT-NEXT:    s_waitcnt vmcnt(1)
 ; NOOPT-NEXT:    v_movrels_b32_e32 v0, v0
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:156 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:152 ; 4-byte Folded Spill
@@ -5302,10 +5329,10 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
 ; NOOPT-NEXT:    v_readlane_b32 s5, v18, 1
 ; NOOPT-NEXT:    v_readlane_b32 s6, v18, 2
 ; NOOPT-NEXT:    v_readlane_b32 s7, v18, 3
-; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt vmcnt(2)
 ; NOOPT-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[4:7], 0
@@ -5860,6 +5887,7 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v32, 7
 ; NOOPT-NEXT:    v_readlane_b32 s1, v32, 8
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload
@@ -5884,12 +5912,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_mov_b32 m0, s2
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Spill
@@ -5998,6 +6026,7 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v32, 11
 ; NOOPT-NEXT:    v_readlane_b32 s1, v32, 12
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload
@@ -6022,12 +6051,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_mov_b32 m0, s2
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:220 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:224 ; 4-byte Folded Spill
@@ -6088,42 +6117,52 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    v_readlane_b32 s5, v32, 1
 ; NOOPT-NEXT:    v_readlane_b32 s6, v32, 2
 ; NOOPT-NEXT:    v_readlane_b32 s7, v32, 3
-; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v18, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v19, off, s[28:31], 0 offset:232 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v20, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v21, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v22, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v18, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v23, off, s[28:31], 0 offset:248 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v24, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v25, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v26, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v22, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v21, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v20, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v27, off, s[28:31], 0 offset:264 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v28, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v29, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v30, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v26, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v25, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v24, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v31, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(12)
+; NOOPT-NEXT:    buffer_load_dword v30, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v29, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v28, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_mov_b32_e32 v5, v19
 ; NOOPT-NEXT:    v_mov_b32_e32 v6, v18
 ; NOOPT-NEXT:    v_mov_b32_e32 v7, v17
+; NOOPT-NEXT:    s_waitcnt vmcnt(13)
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v16
-; NOOPT-NEXT:    s_waitcnt vmcnt(8)
+; NOOPT-NEXT:    s_waitcnt vmcnt(12)
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v23
+; NOOPT-NEXT:    s_waitcnt vmcnt(11)
 ; NOOPT-NEXT:    v_mov_b32_e32 v3, v22
+; NOOPT-NEXT:    s_waitcnt vmcnt(10)
 ; NOOPT-NEXT:    v_mov_b32_e32 v4, v21
+; NOOPT-NEXT:    s_waitcnt vmcnt(9)
 ; NOOPT-NEXT:    v_mov_b32_e32 v8, v20
-; NOOPT-NEXT:    s_waitcnt vmcnt(4)
+; NOOPT-NEXT:    s_waitcnt vmcnt(8)
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, v27
+; NOOPT-NEXT:    s_waitcnt vmcnt(7)
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, v26
+; NOOPT-NEXT:    s_waitcnt vmcnt(6)
 ; NOOPT-NEXT:    v_mov_b32_e32 v15, v25
+; NOOPT-NEXT:    s_waitcnt vmcnt(5)
 ; NOOPT-NEXT:    v_mov_b32_e32 v9, v24
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(4)
 ; NOOPT-NEXT:    v_mov_b32_e32 v10, v31
+; NOOPT-NEXT:    s_waitcnt vmcnt(3)
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v30
+; NOOPT-NEXT:    s_waitcnt vmcnt(2)
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v29
+; NOOPT-NEXT:    s_waitcnt vmcnt(1)
 ; NOOPT-NEXT:    v_mov_b32_e32 v16, v28
 ; NOOPT-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec
 ; NOOPT-NEXT:    v_mov_b32_e32 v17, v12
@@ -9094,6 +9133,7 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v18, 6
 ; NOOPT-NEXT:    v_readlane_b32 s1, v18, 7
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
@@ -9118,12 +9158,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_mov_b32 m0, s2
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_movreld_b32_e32 v0, v16
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:84 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:88 ; 4-byte Folded Spill
@@ -9580,6 +9620,7 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v33, 9
 ; NOOPT-NEXT:    v_readlane_b32 s1, v33, 10
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
@@ -9604,12 +9645,12 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
 ; NOOPT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v17
 ; NOOPT-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; NOOPT-NEXT:    s_mov_b32 m0, s2
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_movreld_b32_e32 v1, v16
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill
@@ -9669,48 +9710,58 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    v_readlane_b32 s1, v33, 4
 ; NOOPT-NEXT:    v_readlane_b32 s2, v33, 5
 ; NOOPT-NEXT:    v_readlane_b32 s3, v33, 6
-; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v18, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v19, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v20, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v21, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v22, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v23, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v19, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v18, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v24, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v25, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v26, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v27, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v23, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v22, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v21, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v28, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v29, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v30, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v31, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v27, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v26, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v25, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v32, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(12)
+; NOOPT-NEXT:    buffer_load_dword v31, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v30, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v29, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
 ; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
 ; NOOPT-NEXT:    v_mov_b32_e32 v8, v18
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, v17
-; NOOPT-NEXT:    s_waitcnt vmcnt(8)
+; NOOPT-NEXT:    s_waitcnt vmcnt(13)
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v24
+; NOOPT-NEXT:    s_waitcnt vmcnt(12)
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v23
+; NOOPT-NEXT:    s_waitcnt vmcnt(11)
 ; NOOPT-NEXT:    v_mov_b32_e32 v3, v22
+; NOOPT-NEXT:    s_waitcnt vmcnt(10)
 ; NOOPT-NEXT:    v_mov_b32_e32 v9, v21
-; NOOPT-NEXT:    s_waitcnt vmcnt(4)
+; NOOPT-NEXT:    s_waitcnt vmcnt(9)
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, v28
+; NOOPT-NEXT:    s_waitcnt vmcnt(8)
 ; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
+; NOOPT-NEXT:    s_waitcnt vmcnt(7)
 ; NOOPT-NEXT:    v_mov_b32_e32 v16, v26
+; NOOPT-NEXT:    s_waitcnt vmcnt(6)
 ; NOOPT-NEXT:    v_mov_b32_e32 v10, v25
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    s_waitcnt vmcnt(5)
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v32
+; NOOPT-NEXT:    s_waitcnt vmcnt(4)
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v31
+; NOOPT-NEXT:    s_waitcnt vmcnt(3)
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, v30
+; NOOPT-NEXT:    s_waitcnt vmcnt(2)
 ; NOOPT-NEXT:    v_mov_b32_e32 v17, v29
 ; NOOPT-NEXT:    ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec
 ; NOOPT-NEXT:    v_mov_b32_e32 v18, v13
 ; NOOPT-NEXT:    v_mov_b32_e32 v19, v12
 ; NOOPT-NEXT:    v_mov_b32_e32 v20, v11
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v5
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v4
 ; NOOPT-NEXT:    buffer_store_dwordx4 v[17:20], v[11:12], s[0:3], 0 addr64 offset:48
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index 0d3f342f7735e..a7fb563e6698c 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s
 
 ; Check for verifier error due to trying to save and restore SCC
 ; around a waterfall loop when it was never defined. We have to get
@@ -59,14 +59,15 @@ define void @issue92561(ptr addrspace(1) %arg) {
 ; SDAG-NEXT:    s_mov_b32 s7, s12
 ; SDAG-NEXT:    s_clause 0x2
 ; SDAG-NEXT:    image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT:    image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; SDAG-NEXT:    image_sample_c_lz v2, [v1, v2, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT:    image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; SDAG-NEXT:    v_add_f32_e32 v0, v9, v0
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; SDAG-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_add_f32_e32 v0, v3, v0
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e800000, v0
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a2da8876472ab..c1078e280621c 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -5199,10 +5199,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_load_dword s2, s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa
-; SI-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:49
 ; SI-NEXT:    buffer_load_ubyte v5, off, s[4:7], 0 offset:50
-; SI-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:51
 ; SI-NEXT:    buffer_load_ubyte v7, off, s[4:7], 0 offset:52
+; SI-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:49
+; SI-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:51
 ; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5684,17 +5684,17 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_add_u32 s0, s4, 42
 ; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
 ; VI-NEXT:    flat_load_ushort v4, v[4:5]
 ; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    flat_load_ushort v0, v[0:1]
 ; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    flat_store_byte v[0:1], v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_short v[0:1], v4
diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index 69a871f6f6ae5..a33fd0eae8726 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -341,14 +341,15 @@ define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) {
 ; SPLIT-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 8
 ; SPLIT-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
 ; SPLIT-NEXT:    s_clause 0x3
-; SPLIT-NEXT:    flat_load_dword v8, v[2:3]
 ; SPLIT-NEXT:    flat_load_dword v9, v[4:5]
 ; SPLIT-NEXT:    flat_load_dword v10, v[0:1]
+; SPLIT-NEXT:    flat_load_dword v8, v[2:3]
 ; SPLIT-NEXT:    flat_load_dword v11, v[6:7]
-; SPLIT-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; SPLIT-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; SPLIT-NEXT:    flat_store_dword v[6:7], v9
-; SPLIT-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; SPLIT-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
 ; SPLIT-NEXT:    flat_store_dword v[2:3], v10
+; SPLIT-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
 ; SPLIT-NEXT:    flat_store_dword v[0:1], v8
 ; SPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
 ; SPLIT-NEXT:    flat_store_dword v[4:5], v11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index efb55db486489..eea59d943279b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -807,22 +807,22 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ; VI-FLUSH-NEXT:    s_mov_b32 s14, s10
 ; VI-FLUSH-NEXT:    s_mov_b32 s15, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT:    s_mov_b32 s12, s2
-; VI-FLUSH-NEXT:    s_mov_b32 s13, s3
 ; VI-FLUSH-NEXT:    s_mov_b32 s16, s4
 ; VI-FLUSH-NEXT:    s_mov_b32 s17, s5
-; VI-FLUSH-NEXT:    s_mov_b32 s18, s10
-; VI-FLUSH-NEXT:    s_mov_b32 s19, s11
 ; VI-FLUSH-NEXT:    s_mov_b32 s4, s6
 ; VI-FLUSH-NEXT:    s_mov_b32 s5, s7
 ; VI-FLUSH-NEXT:    s_mov_b32 s6, s10
 ; VI-FLUSH-NEXT:    s_mov_b32 s7, s11
-; VI-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; VI-FLUSH-NEXT:    s_mov_b32 s12, s2
+; VI-FLUSH-NEXT:    s_mov_b32 s13, s3
+; VI-FLUSH-NEXT:    s_mov_b32 s18, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s19, s11
 ; VI-FLUSH-NEXT:    buffer_load_dword v1, off, s[4:7], 0
+; VI-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; VI-FLUSH-NEXT:    buffer_load_dword v2, off, s[16:19], 0
 ; VI-FLUSH-NEXT:    s_mov_b32 s8, s0
 ; VI-FLUSH-NEXT:    s_mov_b32 s9, s1
-; VI-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(2)
 ; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; VI-FLUSH-NEXT:    v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index f971080e02c5b..515b9a425b2d7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -2399,9 +2399,9 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:84
 ; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:96
 ; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:92
-; GFX950-NEXT:    scratch_load_dword v31, off, s32
 ; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:104
 ; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v31, off, s32
 ; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
@@ -2485,7 +2485,7 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
 ; GFX950-NEXT:    v_cndmask_b32_e64 v22, v0, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(6)
+; GFX950-NEXT:    s_waitcnt vmcnt(7)
 ; GFX950-NEXT:    v_max_f64 v[0:1], v[24:25], v[34:35]
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[34:35]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
@@ -2529,25 +2529,25 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
 ; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
 ; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:32
 ; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:28
-; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
-; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:56
-; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
 ; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:48
 ; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:44
-; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
-; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:96
-; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:92
-; GFX10-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:88
-; GFX10-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:72
 ; GFX10-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:80
 ; GFX10-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:76
-; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:100
 ; GFX10-NEXT:    s_waitcnt vmcnt(23)
 ; GFX10-NEXT:    v_max_f64 v[82:83], v[0:1], v[31:32]
 ; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[31:32]
@@ -2558,38 +2558,39 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX10-NEXT:    v_max_f64 v[32:33], v[4:5], v[35:36]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s5, v[4:5], v[35:36]
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112
 ; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:108
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:128
 ; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:124
-; GFX10-NEXT:    s_waitcnt vmcnt(24)
+; GFX10-NEXT:    s_waitcnt vmcnt(25)
 ; GFX10-NEXT:    v_max_f64 v[34:35], v[6:7], v[48:49]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s6, v[6:7], v[48:49]
-; GFX10-NEXT:    s_waitcnt vmcnt(21)
-; GFX10-NEXT:    v_cmp_u_f64_e64 s10, v[14:15], v[52:53]
-; GFX10-NEXT:    s_waitcnt vmcnt(19)
-; GFX10-NEXT:    v_cmp_u_f64_e64 s9, v[12:13], v[54:55]
-; GFX10-NEXT:    s_waitcnt vmcnt(17)
-; GFX10-NEXT:    v_cmp_u_f64_e64 s8, v[10:11], v[64:65]
-; GFX10-NEXT:    s_waitcnt vmcnt(16)
+; GFX10-NEXT:    s_waitcnt vmcnt(23)
 ; GFX10-NEXT:    v_max_f64 v[48:49], v[8:9], v[37:38]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s7, v[8:9], v[37:38]
+; GFX10-NEXT:    s_waitcnt vmcnt(21)
 ; GFX10-NEXT:    v_max_f64 v[36:37], v[10:11], v[64:65]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s8, v[10:11], v[64:65]
+; GFX10-NEXT:    s_waitcnt vmcnt(19)
 ; GFX10-NEXT:    v_max_f64 v[38:39], v[12:13], v[54:55]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s9, v[12:13], v[54:55]
+; GFX10-NEXT:    s_waitcnt vmcnt(17)
 ; GFX10-NEXT:    v_max_f64 v[54:55], v[14:15], v[52:53]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s10, v[14:15], v[52:53]
+; GFX10-NEXT:    s_waitcnt vmcnt(15)
+; GFX10-NEXT:    v_max_f64 v[52:53], v[16:17], v[50:51]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s11, v[16:17], v[50:51]
+; GFX10-NEXT:    s_waitcnt vmcnt(13)
+; GFX10-NEXT:    v_max_f64 v[50:51], v[18:19], v[80:81]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s12, v[18:19], v[80:81]
 ; GFX10-NEXT:    s_waitcnt vmcnt(11)
 ; GFX10-NEXT:    v_max_f64 v[64:65], v[20:21], v[70:71]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s13, v[20:21], v[70:71]
 ; GFX10-NEXT:    s_waitcnt vmcnt(9)
-; GFX10-NEXT:    v_cmp_u_f64_e64 s12, v[18:19], v[80:81]
-; GFX10-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-NEXT:    v_max_f64 v[52:53], v[16:17], v[50:51]
-; GFX10-NEXT:    v_cmp_u_f64_e64 s11, v[16:17], v[50:51]
-; GFX10-NEXT:    v_max_f64 v[50:51], v[18:19], v[80:81]
 ; GFX10-NEXT:    v_max_f64 v[70:71], v[22:23], v[68:69]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s14, v[22:23], v[68:69]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v34, 0, s6
@@ -2610,7 +2611,7 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, v65, 0x7ff80000, s13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v22, v70, 0, s14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v23, v71, 0x7ff80000, s14
-; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_max_f64 v[68:69], v[24:25], v[66:67]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s15, v[24:25], v[66:67]
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -2619,10 +2620,10 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_max_f64 v[80:81], v[28:29], v[2:3]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s17, v[28:29], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v82, 0, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f64 v[86:87], v[30:31], v[4:5]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s18, v[30:31], v[4:5]
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v82, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v83, 0x7ff80000, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v84, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v85, 0x7ff80000, s4
@@ -2642,7 +2643,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:16
@@ -2673,51 +2673,52 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:108
 ; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:120
 ; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:128
 ; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:124
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-NEXT:    v_max_f64 v[96:97], v[0:1], v[32:33]
 ; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33]
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-NEXT:    v_max_f64 v[32:33], v[2:3], v[34:35]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s0, v[2:3], v[34:35]
-; GFX11-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-NEXT:    v_max_f64 v[34:35], v[4:5], v[36:37]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s1, v[4:5], v[36:37]
-; GFX11-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-NEXT:    v_max_f64 v[36:37], v[6:7], v[38:39]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s2, v[6:7], v[38:39]
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-NEXT:    v_max_f64 v[38:39], v[8:9], v[48:49]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s3, v[8:9], v[48:49]
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-NEXT:    v_max_f64 v[48:49], v[10:11], v[50:51]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s4, v[10:11], v[50:51]
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-NEXT:    s_waitcnt vmcnt(19)
 ; GFX11-NEXT:    v_max_f64 v[50:51], v[12:13], v[52:53]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s5, v[12:13], v[52:53]
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-NEXT:    s_waitcnt vmcnt(17)
 ; GFX11-NEXT:    v_max_f64 v[52:53], v[14:15], v[54:55]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s6, v[14:15], v[54:55]
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
 ; GFX11-NEXT:    v_max_f64 v[54:55], v[16:17], v[64:65]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s7, v[16:17], v[64:65]
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-NEXT:    v_max_f64 v[64:65], v[18:19], v[66:67]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s8, v[18:19], v[66:67]
-; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-NEXT:    v_max_f64 v[66:67], v[20:21], v[68:69]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s9, v[20:21], v[68:69]
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-NEXT:    v_max_f64 v[68:69], v[22:23], v[70:71]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s10, v[22:23], v[70:71]
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-NEXT:    v_max_f64 v[70:71], v[24:25], v[80:81]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s11, v[24:25], v[80:81]
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    v_max_f64 v[80:81], v[26:27], v[82:83]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s12, v[26:27], v[82:83]
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_max_f64 v[82:83], v[28:29], v[84:85]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s13, v[28:29], v[84:85]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -2765,7 +2766,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1f
-; GFX12-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX12-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX12-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX12-NEXT:    scratch_load_b32 v35, off, s32 offset:16
@@ -2796,37 +2796,38 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX12-NEXT:    scratch_load_b32 v82, off, s32 offset:108
 ; GFX12-NEXT:    scratch_load_b32 v85, off, s32 offset:120
 ; GFX12-NEXT:    scratch_load_b32 v84, off, s32 offset:116
+; GFX12-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX12-NEXT:    scratch_load_b32 v87, off, s32 offset:128
 ; GFX12-NEXT:    scratch_load_b32 v86, off, s32 offset:124
-; GFX12-NEXT:    s_wait_loadcnt 0x1e
+; GFX12-NEXT:    s_wait_loadcnt 0x1f
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[32:33]
-; GFX12-NEXT:    s_wait_loadcnt 0x1c
+; GFX12-NEXT:    s_wait_loadcnt 0x1d
 ; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[34:35]
-; GFX12-NEXT:    s_wait_loadcnt 0x1a
+; GFX12-NEXT:    s_wait_loadcnt 0x1b
 ; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[36:37]
-; GFX12-NEXT:    s_wait_loadcnt 0x18
+; GFX12-NEXT:    s_wait_loadcnt 0x19
 ; GFX12-NEXT:    v_maximum_f64 v[6:7], v[6:7], v[38:39]
-; GFX12-NEXT:    s_wait_loadcnt 0x16
+; GFX12-NEXT:    s_wait_loadcnt 0x17
 ; GFX12-NEXT:    v_maximum_f64 v[8:9], v[8:9], v[48:49]
-; GFX12-NEXT:    s_wait_loadcnt 0x14
+; GFX12-NEXT:    s_wait_loadcnt 0x15
 ; GFX12-NEXT:    v_maximum_f64 v[10:11], v[10:11], v[50:51]
-; GFX12-NEXT:    s_wait_loadcnt 0x12
+; GFX12-NEXT:    s_wait_loadcnt 0x13
 ; GFX12-NEXT:    v_maximum_f64 v[12:13], v[12:13], v[52:53]
-; GFX12-NEXT:    s_wait_loadcnt 0x10
+; GFX12-NEXT:    s_wait_loadcnt 0x11
 ; GFX12-NEXT:    v_maximum_f64 v[14:15], v[14:15], v[54:55]
-; GFX12-NEXT:    s_wait_loadcnt 0xe
+; GFX12-NEXT:    s_wait_loadcnt 0xf
 ; GFX12-NEXT:    v_maximum_f64 v[16:17], v[16:17], v[64:65]
-; GFX12-NEXT:    s_wait_loadcnt 0xc
+; GFX12-NEXT:    s_wait_loadcnt 0xd
 ; GFX12-NEXT:    v_maximum_f64 v[18:19], v[18:19], v[66:67]
-; GFX12-NEXT:    s_wait_loadcnt 0xa
+; GFX12-NEXT:    s_wait_loadcnt 0xb
 ; GFX12-NEXT:    v_maximum_f64 v[20:21], v[20:21], v[68:69]
-; GFX12-NEXT:    s_wait_loadcnt 0x8
+; GFX12-NEXT:    s_wait_loadcnt 0x9
 ; GFX12-NEXT:    v_maximum_f64 v[22:23], v[22:23], v[70:71]
-; GFX12-NEXT:    s_wait_loadcnt 0x6
+; GFX12-NEXT:    s_wait_loadcnt 0x7
 ; GFX12-NEXT:    v_maximum_f64 v[24:25], v[24:25], v[80:81]
-; GFX12-NEXT:    s_wait_loadcnt 0x4
+; GFX12-NEXT:    s_wait_loadcnt 0x5
 ; GFX12-NEXT:    v_maximum_f64 v[26:27], v[26:27], v[82:83]
-; GFX12-NEXT:    s_wait_loadcnt 0x2
+; GFX12-NEXT:    s_wait_loadcnt 0x3
 ; GFX12-NEXT:    v_maximum_f64 v[28:29], v[28:29], v[84:85]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_maximum_f64 v[30:31], v[30:31], v[86:87]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index dfd67873c3b86..27d5955fbd9e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -2399,9 +2399,9 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:84
 ; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:96
 ; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:92
-; GFX950-NEXT:    scratch_load_dword v31, off, s32
 ; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:104
 ; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v31, off, s32
 ; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
@@ -2485,7 +2485,7 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
 ; GFX950-NEXT:    v_cndmask_b32_e64 v22, v0, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(6)
+; GFX950-NEXT:    s_waitcnt vmcnt(7)
 ; GFX950-NEXT:    v_min_f64 v[0:1], v[24:25], v[34:35]
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[34:35]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
@@ -2529,25 +2529,25 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
 ; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
 ; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:32
 ; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:28
-; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
-; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:56
-; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
 ; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:48
 ; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:44
-; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
-; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:96
-; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:92
-; GFX10-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:88
-; GFX10-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:72
 ; GFX10-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:80
 ; GFX10-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:76
-; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:100
 ; GFX10-NEXT:    s_waitcnt vmcnt(23)
 ; GFX10-NEXT:    v_min_f64 v[82:83], v[0:1], v[31:32]
 ; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[31:32]
@@ -2558,38 +2558,39 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX10-NEXT:    v_min_f64 v[32:33], v[4:5], v[35:36]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s5, v[4:5], v[35:36]
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112
 ; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:108
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:128
 ; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:124
-; GFX10-NEXT:    s_waitcnt vmcnt(24)
+; GFX10-NEXT:    s_waitcnt vmcnt(25)
 ; GFX10-NEXT:    v_min_f64 v[34:35], v[6:7], v[48:49]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s6, v[6:7], v[48:49]
-; GFX10-NEXT:    s_waitcnt vmcnt(21)
-; GFX10-NEXT:    v_cmp_u_f64_e64 s10, v[14:15], v[52:53]
-; GFX10-NEXT:    s_waitcnt vmcnt(19)
-; GFX10-NEXT:    v_cmp_u_f64_e64 s9, v[12:13], v[54:55]
-; GFX10-NEXT:    s_waitcnt vmcnt(17)
-; GFX10-NEXT:    v_cmp_u_f64_e64 s8, v[10:11], v[64:65]
-; GFX10-NEXT:    s_waitcnt vmcnt(16)
+; GFX10-NEXT:    s_waitcnt vmcnt(23)
 ; GFX10-NEXT:    v_min_f64 v[48:49], v[8:9], v[37:38]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s7, v[8:9], v[37:38]
+; GFX10-NEXT:    s_waitcnt vmcnt(21)
 ; GFX10-NEXT:    v_min_f64 v[36:37], v[10:11], v[64:65]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s8, v[10:11], v[64:65]
+; GFX10-NEXT:    s_waitcnt vmcnt(19)
 ; GFX10-NEXT:    v_min_f64 v[38:39], v[12:13], v[54:55]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s9, v[12:13], v[54:55]
+; GFX10-NEXT:    s_waitcnt vmcnt(17)
 ; GFX10-NEXT:    v_min_f64 v[54:55], v[14:15], v[52:53]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s10, v[14:15], v[52:53]
+; GFX10-NEXT:    s_waitcnt vmcnt(15)
+; GFX10-NEXT:    v_min_f64 v[52:53], v[16:17], v[50:51]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s11, v[16:17], v[50:51]
+; GFX10-NEXT:    s_waitcnt vmcnt(13)
+; GFX10-NEXT:    v_min_f64 v[50:51], v[18:19], v[80:81]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s12, v[18:19], v[80:81]
 ; GFX10-NEXT:    s_waitcnt vmcnt(11)
 ; GFX10-NEXT:    v_min_f64 v[64:65], v[20:21], v[70:71]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s13, v[20:21], v[70:71]
 ; GFX10-NEXT:    s_waitcnt vmcnt(9)
-; GFX10-NEXT:    v_cmp_u_f64_e64 s12, v[18:19], v[80:81]
-; GFX10-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-NEXT:    v_min_f64 v[52:53], v[16:17], v[50:51]
-; GFX10-NEXT:    v_cmp_u_f64_e64 s11, v[16:17], v[50:51]
-; GFX10-NEXT:    v_min_f64 v[50:51], v[18:19], v[80:81]
 ; GFX10-NEXT:    v_min_f64 v[70:71], v[22:23], v[68:69]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s14, v[22:23], v[68:69]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v34, 0, s6
@@ -2610,7 +2611,7 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, v65, 0x7ff80000, s13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v22, v70, 0, s14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v23, v71, 0x7ff80000, s14
-; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_min_f64 v[68:69], v[24:25], v[66:67]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s15, v[24:25], v[66:67]
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -2619,10 +2620,10 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_min_f64 v[80:81], v[28:29], v[2:3]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s17, v[28:29], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v82, 0, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_min_f64 v[86:87], v[30:31], v[4:5]
 ; GFX10-NEXT:    v_cmp_u_f64_e64 s18, v[30:31], v[4:5]
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v82, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v83, 0x7ff80000, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v84, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v85, 0x7ff80000, s4
@@ -2642,7 +2643,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:16
@@ -2673,51 +2673,52 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:108
 ; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:120
 ; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:128
 ; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:124
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-NEXT:    v_min_f64 v[96:97], v[0:1], v[32:33]
 ; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33]
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-NEXT:    v_min_f64 v[32:33], v[2:3], v[34:35]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s0, v[2:3], v[34:35]
-; GFX11-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-NEXT:    v_min_f64 v[34:35], v[4:5], v[36:37]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s1, v[4:5], v[36:37]
-; GFX11-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-NEXT:    s_waitcnt vmcnt(25)
 ; GFX11-NEXT:    v_min_f64 v[36:37], v[6:7], v[38:39]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s2, v[6:7], v[38:39]
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
 ; GFX11-NEXT:    v_min_f64 v[38:39], v[8:9], v[48:49]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s3, v[8:9], v[48:49]
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
 ; GFX11-NEXT:    v_min_f64 v[48:49], v[10:11], v[50:51]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s4, v[10:11], v[50:51]
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-NEXT:    s_waitcnt vmcnt(19)
 ; GFX11-NEXT:    v_min_f64 v[50:51], v[12:13], v[52:53]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s5, v[12:13], v[52:53]
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-NEXT:    s_waitcnt vmcnt(17)
 ; GFX11-NEXT:    v_min_f64 v[52:53], v[14:15], v[54:55]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s6, v[14:15], v[54:55]
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
 ; GFX11-NEXT:    v_min_f64 v[54:55], v[16:17], v[64:65]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s7, v[16:17], v[64:65]
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
 ; GFX11-NEXT:    v_min_f64 v[64:65], v[18:19], v[66:67]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s8, v[18:19], v[66:67]
-; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
 ; GFX11-NEXT:    v_min_f64 v[66:67], v[20:21], v[68:69]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s9, v[20:21], v[68:69]
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-NEXT:    v_min_f64 v[68:69], v[22:23], v[70:71]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s10, v[22:23], v[70:71]
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-NEXT:    v_min_f64 v[70:71], v[24:25], v[80:81]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s11, v[24:25], v[80:81]
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    v_min_f64 v[80:81], v[26:27], v[82:83]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s12, v[26:27], v[82:83]
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_min_f64 v[82:83], v[28:29], v[84:85]
 ; GFX11-NEXT:    v_cmp_u_f64_e64 s13, v[28:29], v[84:85]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -2765,7 +2766,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1f
-; GFX12-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX12-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX12-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX12-NEXT:    scratch_load_b32 v35, off, s32 offset:16
@@ -2796,37 +2796,38 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX12-NEXT:    scratch_load_b32 v82, off, s32 offset:108
 ; GFX12-NEXT:    scratch_load_b32 v85, off, s32 offset:120
 ; GFX12-NEXT:    scratch_load_b32 v84, off, s32 offset:116
+; GFX12-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX12-NEXT:    scratch_load_b32 v87, off, s32 offset:128
 ; GFX12-NEXT:    scratch_load_b32 v86, off, s32 offset:124
-; GFX12-NEXT:    s_wait_loadcnt 0x1e
+; GFX12-NEXT:    s_wait_loadcnt 0x1f
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[32:33]
-; GFX12-NEXT:    s_wait_loadcnt 0x1c
+; GFX12-NEXT:    s_wait_loadcnt 0x1d
 ; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[34:35]
-; GFX12-NEXT:    s_wait_loadcnt 0x1a
+; GFX12-NEXT:    s_wait_loadcnt 0x1b
 ; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[36:37]
-; GFX12-NEXT:    s_wait_loadcnt 0x18
+; GFX12-NEXT:    s_wait_loadcnt 0x19
 ; GFX12-NEXT:    v_minimum_f64 v[6:7], v[6:7], v[38:39]
-; GFX12-NEXT:    s_wait_loadcnt 0x16
+; GFX12-NEXT:    s_wait_loadcnt 0x17
 ; GFX12-NEXT:    v_minimum_f64 v[8:9], v[8:9], v[48:49]
-; GFX12-NEXT:    s_wait_loadcnt 0x14
+; GFX12-NEXT:    s_wait_loadcnt 0x15
 ; GFX12-NEXT:    v_minimum_f64 v[10:11], v[10:11], v[50:51]
-; GFX12-NEXT:    s_wait_loadcnt 0x12
+; GFX12-NEXT:    s_wait_loadcnt 0x13
 ; GFX12-NEXT:    v_minimum_f64 v[12:13], v[12:13], v[52:53]
-; GFX12-NEXT:    s_wait_loadcnt 0x10
+; GFX12-NEXT:    s_wait_loadcnt 0x11
 ; GFX12-NEXT:    v_minimum_f64 v[14:15], v[14:15], v[54:55]
-; GFX12-NEXT:    s_wait_loadcnt 0xe
+; GFX12-NEXT:    s_wait_loadcnt 0xf
 ; GFX12-NEXT:    v_minimum_f64 v[16:17], v[16:17], v[64:65]
-; GFX12-NEXT:    s_wait_loadcnt 0xc
+; GFX12-NEXT:    s_wait_loadcnt 0xd
 ; GFX12-NEXT:    v_minimum_f64 v[18:19], v[18:19], v[66:67]
-; GFX12-NEXT:    s_wait_loadcnt 0xa
+; GFX12-NEXT:    s_wait_loadcnt 0xb
 ; GFX12-NEXT:    v_minimum_f64 v[20:21], v[20:21], v[68:69]
-; GFX12-NEXT:    s_wait_loadcnt 0x8
+; GFX12-NEXT:    s_wait_loadcnt 0x9
 ; GFX12-NEXT:    v_minimum_f64 v[22:23], v[22:23], v[70:71]
-; GFX12-NEXT:    s_wait_loadcnt 0x6
+; GFX12-NEXT:    s_wait_loadcnt 0x7
 ; GFX12-NEXT:    v_minimum_f64 v[24:25], v[24:25], v[80:81]
-; GFX12-NEXT:    s_wait_loadcnt 0x4
+; GFX12-NEXT:    s_wait_loadcnt 0x5
 ; GFX12-NEXT:    v_minimum_f64 v[26:27], v[26:27], v[82:83]
-; GFX12-NEXT:    s_wait_loadcnt 0x2
+; GFX12-NEXT:    s_wait_loadcnt 0x3
 ; GFX12-NEXT:    v_minimum_f64 v[28:29], v[28:29], v[84:85]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_minimum_f64 v[30:31], v[30:31], v[86:87]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index a135b43bad0fe..049c1329422cd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -567,39 +567,52 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:2
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:6
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:10
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[0:3], 0 offset:12
 ; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v7, off, s[0:3], 0 offset:14
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[0:3], 0 offset:18
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[0:3], 0 offset:20
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[0:3], 0 offset:22
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[0:3], 0 offset:24
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:10
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:6
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:2
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[0:3], 0 offset:30
 ; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v13, off, s[0:3], 0 offset:26
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[0:3], 0 offset:22
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[0:3], 0 offset:18
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[0:3], 0 offset:12
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v14, off, s[0:3], 0 offset:28
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[0:3], 0 offset:30
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[0:3], 0 offset:24
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[0:3], 0 offset:20
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(14)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(13)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v2, v16, v5
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v1, v17, v4
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v0, v18, v0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v7, v15, v14
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v6, v13, v12
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v5, v11, v10
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v4, v9, v8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -774,25 +787,24 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GFX12-TRUE16-NEXT:    global_load_d16_b16 v2, v8, s[0:1] offset:8
 ; GFX12-TRUE16-NEXT:    global_load_d16_b16 v1, v8, s[0:1] offset:4
 ; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v8, s[0:1]
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x5
 ; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x5
+; GFX12-TRUE16-NEXT:    s_clause 0x2
 ; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-TRUE16-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
@@ -810,25 +822,24 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GFX12-FAKE16-NEXT:    global_load_u16 v2, v8, s[0:1] offset:8
 ; GFX12-FAKE16-NEXT:    global_load_u16 v1, v8, s[0:1] offset:4
 ; GFX12-FAKE16-NEXT:    global_load_u16 v0, v8, s[0:1]
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x5
 ; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x5
+; GFX12-FAKE16-NEXT:    s_clause 0x2
 ; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
-; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
+; GFX12-FAKE16-NEXT:    global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-FAKE16-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX12-FAKE16-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
index c119ef274bb04..3edba28348bf4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -712,8 +712,8 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s4
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
@@ -734,8 +734,9 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[12:15]
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d06e941c..7373a2bc880c1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -650,39 +650,52 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s5
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[8:11], 0 offset:4
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:6
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[8:11], 0 offset:8
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0 offset:10
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[8:11], 0 offset:12
 ; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v7, off, s[8:11], 0 offset:14
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[8:11], 0 offset:18
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[8:11], 0 offset:20
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[8:11], 0 offset:22
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[8:11], 0 offset:24
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0 offset:10
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:6
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[8:11], 0 offset:30
 ; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v13, off, s[8:11], 0 offset:26
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[8:11], 0 offset:22
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[8:11], 0 offset:18
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[8:11], 0 offset:12
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[8:11], 0 offset:8
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[8:11], 0 offset:4
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v14, off, s[8:11], 0 offset:28
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[8:11], 0 offset:30
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[8:11], 0 offset:24
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[8:11], 0 offset:20
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(14)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(13)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(11)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v2, v16, v5
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v1, v17, v4
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v0, v18, v0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v7, v15, v14
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v6, v13, v12
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v5, v11, v10
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v4, v9, v8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -4351,15 +4364,15 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s7
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, s3
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v11
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v11, 0, 16
@@ -4372,7 +4385,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v9, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v8, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v35
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v34
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v35, 0, 16
@@ -4381,7 +4394,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v32
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v33, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v32, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v39
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v38
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v39, 0, 16
@@ -4390,7 +4403,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v45, 16, v36
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v46, v37, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v44, v36, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v39, 16, v43
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v37, 16, v42
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v38, v43, 0, 16
@@ -4399,6 +4412,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v49, 16, v40
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v50, v41, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v48, v40, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v43, 16, v31
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v41, 16, v30
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v42, v31, 0, 16
@@ -4407,6 +4421,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v53, 16, v28
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v54, v29, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v52, v28, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v30, 16, v27
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v28, 16, v26
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v29, v27, 0, 16
@@ -4415,6 +4430,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v57, 16, v24
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v58, v25, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v56, v24, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v26, 16, v23
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 16, v22
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v25, v23, 0, 16
@@ -4423,6 +4439,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v61, 16, v20
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v62, v21, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v60, v20, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v19, 0, 16
@@ -8115,8 +8132,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v14
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v15
@@ -8124,10 +8141,11 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v4
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v23, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v26, 31, v15
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e55fb2cac0985..a759b11785b29 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -2755,8 +2755,8 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v5
 ; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
 ; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
@@ -2773,7 +2773,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v10
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v11
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v16
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v17
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
@@ -2781,7 +2781,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v18
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v19
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6) expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v12
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v13
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
@@ -3906,15 +3906,15 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
@@ -3928,7 +3928,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v32
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v33
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
@@ -3936,7 +3936,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v34
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v35
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6) expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v28
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v29
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
@@ -4021,8 +4021,8 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
-; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, v1
@@ -4117,7 +4117,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v14
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v15
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(13)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v8
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v9
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
@@ -4133,6 +4133,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(14)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v4
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v5
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
@@ -4483,25 +4484,29 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
@@ -4610,21 +4615,23 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
-; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
-; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f879dc660203f..385902f2d707e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -7527,16 +7527,17 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1) expcnt(1)
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3) expcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v14
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v12
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v9
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v53, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v54, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index bd191a37582c0..25cfa24c1ddd6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -7311,8 +7311,8 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v5
-; SI-NEXT:    ds_read2_b64 v[10:13], v0 offset0:4 offset1:5
 ; SI-NEXT:    ds_read2_b64 v[14:17], v0 offset0:6 offset1:7
+; SI-NEXT:    ds_read2_b64 v[10:13], v0 offset0:4 offset1:5
 ; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15
 ; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v3
@@ -7324,7 +7324,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v7
 ; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3
-; SI-NEXT:    s_waitcnt lgkmcnt(4)
+; SI-NEXT:    s_waitcnt lgkmcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v17
 ; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31
@@ -7333,6 +7333,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; SI-NEXT:    v_mov_b32_e32 v20, v1
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v15
 ; SI-NEXT:    ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27
+; SI-NEXT:    s_waitcnt lgkmcnt(6)
 ; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v13
 ; SI-NEXT:    ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23
@@ -7464,12 +7465,12 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v21, v5
 ; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[6:9], v4 offset0:4 offset1:5
 ; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[6:9], v4 offset0:4 offset1:5
 ; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v22, s0
-; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[10:13], v4 offset1:1
 ; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[14:17], v4 offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[10:13], v4 offset1:1
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
 ; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v2
 ; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29
@@ -7480,6 +7481,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
 ; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v9
 ; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
@@ -7496,7 +7498,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17
-; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(7)
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
 ; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v17
 ; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15
@@ -7508,6 +7510,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11
 ; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
 ; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v14
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
 ; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v10
 ; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
index c9615f478e5b5..088fe562dc24f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -11,13 +11,14 @@ define amdgpu_vs void @test(ptr addrspace(8) inreg %arg1, ptr addrspace(3) %arg2
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
 ; CHECK-NEXT:    s_mov_b32 m0, -1
+; CHECK-NEXT:    ds_read_b32 v0, v0
 ; CHECK-NEXT:    ds_read_b32 v3, v1
 ; CHECK-NEXT:    ds_read_b32 v2, v2
 ; CHECK-NEXT:    ds_read_b32 v1, v4
-; CHECK-NEXT:    ds_read_b32 v0, v0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
 ; CHECK-NEXT:    exp mrt0 off, off, off, off
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
 ; CHECK-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float poison, float poison, float poison, float poison, i1 false, i1 false)
@@ -38,12 +39,12 @@ define amdgpu_vs void @test_2(ptr addrspace(8) inreg %arg1, i32 %arg2, i32 inreg
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 8, v1
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 4, v1
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    ds_read_b32 v2, v2
-; CHECK-NEXT:    ds_read_b32 v5, v4
-; CHECK-NEXT:    ds_read_b32 v4, v6
 ; CHECK-NEXT:    ds_read_b32 v9, v7
 ; CHECK-NEXT:    ds_read_b32 v8, v8
 ; CHECK-NEXT:    ds_read_b32 v7, v10
+; CHECK-NEXT:    ds_read_b32 v2, v2
+; CHECK-NEXT:    ds_read_b32 v5, v4
+; CHECK-NEXT:    ds_read_b32 v4, v6
 ; CHECK-NEXT:    ds_read_b32 v6, v1
 ; CHECK-NEXT:    ds_read_b32 v3, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
@@ -78,26 +79,28 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8)
 ; CHECK-NEXT:    ds_read_b32 v6, v0
 ; CHECK-NEXT:    ds_read_b32 v5, v3
 ; CHECK-NEXT:    ds_read_b32 v4, v4
+; CHECK-NEXT:    ds_read_b32 v3, v1
 ; CHECK-NEXT:    ds_read_b32 v8, v7
 ; CHECK-NEXT:    ds_read_b32 v7, v9
-; CHECK-NEXT:    ds_read_b32 v3, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, 4, v2
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 20, v2
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 16, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
 ; CHECK-NEXT:    tbuffer_store_format_xyzw v[3:6], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    tbuffer_store_format_xy v[7:8], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
 ; CHECK-NEXT:    s_waitcnt expcnt(1)
 ; CHECK-NEXT:    ds_read_b32 v5, v11
 ; CHECK-NEXT:    ds_read_b32 v4, v12
 ; CHECK-NEXT:    ds_read_b32 v3, v0
-; CHECK-NEXT:    ds_read_b32 v1, v1
 ; CHECK-NEXT:    ds_read_b32 v0, v9
 ; CHECK-NEXT:    ds_read_b32 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ds_read_b32 v1, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
 ; CHECK-NEXT:    exp mrt0 off, off, off, off
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    tbuffer_store_format_xyzw v[2:5], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    tbuffer_store_format_xy v[0:1], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
 ; CHECK-NEXT:    s_endpgm
   %load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index 5b6af7654f7e9..302cf003042d9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -96,35 +96,38 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
 ; GFX7-NEXT:    ds_read_u8 v5, v0
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v10, v0 offset:12
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:15
 ; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:12
-; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -332,25 +335,27 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v1, v0 offset:2
-; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
-; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
-; GFX7-NEXT:    ds_read_u16 v4, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v5, v0
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:6
+; GFX7-NEXT:    ds_read_u16 v4, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v7, v0 offset:10
+; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
 ; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
+; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index 509aba49893f6..1673a29a5714a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -87,31 +87,34 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    ds_read_u8 v5, v0
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -275,20 +278,21 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v1, v0 offset:2
-; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
-; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v4, v0
 ; GFX7-NEXT:    ds_read_u16 v5, v0 offset:6
+; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
+; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index a5b64f6f80d9b..005fb07d82198 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -116,9 +116,9 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v7, v[2:3]
 ; VI-NEXT:    flat_load_ushort v4, v[4:5]
 ; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v7, v[2:3]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_ushort v8, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 5b7c36559a366..2448f3e3025e8 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -186,14 +186,14 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
 ; CHECK-NEXT:    buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
 ; CHECK-NEXT:    buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
-; CHECK-NEXT:    buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT:    buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT:    buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT:    buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
 ; CHECK-NEXT:    buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
 ; CHECK-NEXT:    buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
 ; CHECK-NEXT:    buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
 ; CHECK-NEXT:    buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v25, s1
 ; CHECK-NEXT:    v_mov_b32_e32 v24, s0
@@ -201,14 +201,14 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT:    buffer_load_dword v0, v26, s[20:23], 0 offen
-; CHECK-NEXT:    buffer_load_dword v1, v26, s[20:23], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v2, v26, s[20:23], 0 offen offset:8
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    buffer_load_dword v4, v26, s[20:23], 0 offen offset:16
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    buffer_load_dword v5, v26, s[20:23], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v6, v26, s[20:23], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[20:23], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[20:23], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[20:23], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:80
@@ -474,14 +474,14 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
 ; CHECK-NEXT:    buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
 ; CHECK-NEXT:    buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
-; CHECK-NEXT:    buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT:    buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT:    buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT:    buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
 ; CHECK-NEXT:    buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
 ; CHECK-NEXT:    buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
 ; CHECK-NEXT:    buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
 ; CHECK-NEXT:    buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v25, s1
 ; CHECK-NEXT:    v_mov_b32_e32 v24, s0
@@ -489,14 +489,14 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT:    buffer_load_dword v0, v26, s[20:23], 0 offen
-; CHECK-NEXT:    buffer_load_dword v1, v26, s[20:23], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v2, v26, s[20:23], 0 offen offset:8
-; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    buffer_load_dword v4, v26, s[20:23], 0 offen offset:16
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    buffer_load_dword v5, v26, s[20:23], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v6, v26, s[20:23], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[20:23], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[20:23], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[20:23], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
index 048610184368d..111354114856b 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -844,16 +844,17 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
@@ -869,15 +870,15 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
@@ -911,16 +912,17 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
@@ -936,15 +938,15 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 02f39e25cb447..3c27adfe7d47b 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -97,6 +97,11 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
 ; ALIGNED-NEXT:    s_clause 0xf
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[16:19], v[24:25] offset:240
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[20:23], v[24:25] offset:224
+; ALIGNED-NEXT:    flat_load_dwordx4 v[100:103], v[24:25] offset:208
+; ALIGNED-NEXT:    flat_load_dwordx4 v[96:99], v[24:25] offset:192
+; ALIGNED-NEXT:    flat_load_dwordx4 v[81:84], v[24:25] offset:176
+; ALIGNED-NEXT:    flat_load_dwordx4 v[66:69], v[24:25] offset:160
+; ALIGNED-NEXT:    flat_load_dwordx4 v[52:55], v[24:25] offset:144
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[4:7], v[24:25]
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[8:11], v[24:25] offset:16
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[12:15], v[24:25] offset:32
@@ -106,11 +111,6 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[26:29], v[24:25] offset:96
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[32:35], v[24:25] offset:112
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[44:47], v[24:25] offset:128
-; ALIGNED-NEXT:    flat_load_dwordx4 v[52:55], v[24:25] offset:144
-; ALIGNED-NEXT:    flat_load_dwordx4 v[66:69], v[24:25] offset:160
-; ALIGNED-NEXT:    flat_load_dwordx4 v[81:84], v[24:25] offset:176
-; ALIGNED-NEXT:    flat_load_dwordx4 v[96:99], v[24:25] offset:192
-; ALIGNED-NEXT:    flat_load_dwordx4 v[100:103], v[24:25] offset:208
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
 ; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64
 ; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68
@@ -160,7 +160,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v36 offset:226
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:224
-; ALIGNED-NEXT:    s_waitcnt lgkmcnt(16)
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(29)
 ; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:32
 ; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:36
 ; ALIGNED-NEXT:    buffer_store_dword v102, off, s[0:3], s32 offset:40
@@ -187,6 +187,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v64 offset:210
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:208
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(36)
 ; ALIGNED-NEXT:    buffer_store_dword v96, off, s[0:3], s32 offset:48
 ; ALIGNED-NEXT:    buffer_store_dword v97, off, s[0:3], s32 offset:52
 ; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:56
@@ -208,6 +209,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v80 offset:194
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:192
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(43)
 ; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:128
 ; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:132
 ; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:136
@@ -229,6 +231,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v81 offset:178
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:176
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(50)
 ; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:144
 ; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:148
 ; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:152
@@ -250,6 +253,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v66 offset:162
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:160
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(57)
 ; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:96
 ; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:100
 ; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:104
@@ -271,6 +275,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v52 offset:146
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:144
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(56)
 ; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112
 ; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:116
 ; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:120
@@ -853,6 +858,11 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
 ; ALIGNED-NEXT:    s_clause 0xf
 ; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[24:25], off offset:240
 ; ALIGNED-NEXT:    global_load_dwordx4 v[20:23], v[24:25], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[100:103], v[24:25], off offset:208
+; ALIGNED-NEXT:    global_load_dwordx4 v[96:99], v[24:25], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[81:84], v[24:25], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[66:69], v[24:25], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[24:25], off offset:144
 ; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[24:25], off
 ; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[24:25], off offset:16
 ; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[24:25], off offset:32
@@ -862,11 +872,6 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
 ; ALIGNED-NEXT:    global_load_dwordx4 v[26:29], v[24:25], off offset:96
 ; ALIGNED-NEXT:    global_load_dwordx4 v[32:35], v[24:25], off offset:112
 ; ALIGNED-NEXT:    global_load_dwordx4 v[44:47], v[24:25], off offset:128
-; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[24:25], off offset:144
-; ALIGNED-NEXT:    global_load_dwordx4 v[66:69], v[24:25], off offset:160
-; ALIGNED-NEXT:    global_load_dwordx4 v[81:84], v[24:25], off offset:176
-; ALIGNED-NEXT:    global_load_dwordx4 v[96:99], v[24:25], off offset:192
-; ALIGNED-NEXT:    global_load_dwordx4 v[100:103], v[24:25], off offset:208
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
 ; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64
 ; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68
@@ -2392,6 +2397,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT:  .LBB3_1: ; %load-store-loop
 ; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ALIGNED-NEXT:    s_clause 0x34
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203
 ; ALIGNED-NEXT:    buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255
 ; ALIGNED-NEXT:    buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:254
 ; ALIGNED-NEXT:    buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:253
@@ -2444,14 +2450,14 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT:    buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:206
 ; ALIGNED-NEXT:    buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:205
 ; ALIGNED-NEXT:    buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203
 ; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
 ; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
 ; ALIGNED-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
 ; ALIGNED-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
-; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_waitcnt vmcnt(52)
 ; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_clause 0x3e
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129
 ; ALIGNED-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:202
 ; ALIGNED-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:201
 ; ALIGNED-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:200
@@ -2514,8 +2520,8 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT:    buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:143
 ; ALIGNED-NEXT:    buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:142
 ; ALIGNED-NEXT:    buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT:    buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140
 ; ALIGNED-NEXT:    s_clause 0xa
+; ALIGNED-NEXT:    buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140
 ; ALIGNED-NEXT:    buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:139
 ; ALIGNED-NEXT:    buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:138
 ; ALIGNED-NEXT:    buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:137
@@ -2526,8 +2532,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT:    buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:132
 ; ALIGNED-NEXT:    buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:131
 ; ALIGNED-NEXT:    buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:130
-; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129
-; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_waitcnt vmcnt(62)
 ; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
@@ -3583,35 +3588,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; CHECK-NEXT:  .LBB4_1: ; %load-store-loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_clause 0x3e
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:32
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:36
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
-; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:44
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:48
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
-; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
-; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
-; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
 ; CHECK-NEXT:    buffer_load_dword v34, v2, s[0:3], 0 offen offset:252
 ; CHECK-NEXT:    buffer_load_dword v33, v2, s[0:3], 0 offen offset:248
 ; CHECK-NEXT:    buffer_load_dword v32, v2, s[0:3], 0 offen offset:244
 ; CHECK-NEXT:    buffer_load_dword v31, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
 ; CHECK-NEXT:    buffer_load_dword v29, v2, s[0:3], 0 offen offset:232
 ; CHECK-NEXT:    buffer_load_dword v28, v2, s[0:3], 0 offen offset:228
 ; CHECK-NEXT:    buffer_load_dword v27, v2, s[0:3], 0 offen offset:224
@@ -3623,11 +3604,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; CHECK-NEXT:    buffer_load_dword v50, v2, s[0:3], 0 offen offset:200
 ; CHECK-NEXT:    buffer_load_dword v49, v2, s[0:3], 0 offen offset:196
 ; CHECK-NEXT:    buffer_load_dword v48, v2, s[0:3], 0 offen offset:192
-; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
 ; CHECK-NEXT:    buffer_load_dword v67, v2, s[0:3], 0 offen offset:188
 ; CHECK-NEXT:    buffer_load_dword v66, v2, s[0:3], 0 offen offset:184
 ; CHECK-NEXT:    buffer_load_dword v65, v2, s[0:3], 0 offen offset:180
 ; CHECK-NEXT:    buffer_load_dword v64, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
 ; CHECK-NEXT:    buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
 ; CHECK-NEXT:    buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
 ; CHECK-NEXT:    buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
@@ -3639,13 +3620,37 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; CHECK-NEXT:    buffer_load_dword v82, v2, s[0:3], 0 offen offset:136
 ; CHECK-NEXT:    buffer_load_dword v81, v2, s[0:3], 0 offen offset:132
 ; CHECK-NEXT:    buffer_load_dword v80, v2, s[0:3], 0 offen offset:128
-; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:44
 ; CHECK-NEXT:    buffer_load_dword v96, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v97, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v98, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v99, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v87, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
 ; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
@@ -3653,29 +3658,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
 ; CHECK-NEXT:    v_add_nc_u32_e32 v2, 0x100, v2
 ; CHECK-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    s_waitcnt vmcnt(60)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[31:34] offset:240
-; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    s_waitcnt vmcnt(56)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[27:30] offset:224
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    s_waitcnt vmcnt(52)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[35:38] offset:208
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    s_waitcnt vmcnt(48)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:192
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    s_waitcnt vmcnt(44)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:176
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    s_waitcnt vmcnt(40)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:160
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:144
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[23:26] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[19:22] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[15:18] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[11:14] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[7:10] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[3:6] offset:32
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87]
@@ -5336,14 +5347,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; UNROLL3-NEXT:  .LBB4_1: ; %load-store-loop
 ; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; UNROLL3-NEXT:    s_clause 0xb
-; UNROLL3-NEXT:    buffer_load_dword v4, v3, s[0:3], 0 offen
-; UNROLL3-NEXT:    buffer_load_dword v5, v3, s[0:3], 0 offen offset:4
-; UNROLL3-NEXT:    buffer_load_dword v6, v3, s[0:3], 0 offen offset:8
-; UNROLL3-NEXT:    buffer_load_dword v7, v3, s[0:3], 0 offen offset:12
 ; UNROLL3-NEXT:    buffer_load_dword v8, v3, s[0:3], 0 offen offset:16
 ; UNROLL3-NEXT:    buffer_load_dword v9, v3, s[0:3], 0 offen offset:20
 ; UNROLL3-NEXT:    buffer_load_dword v10, v3, s[0:3], 0 offen offset:24
 ; UNROLL3-NEXT:    buffer_load_dword v11, v3, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v4, v3, s[0:3], 0 offen
+; UNROLL3-NEXT:    buffer_load_dword v5, v3, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v6, v3, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v7, v3, s[0:3], 0 offen offset:12
 ; UNROLL3-NEXT:    buffer_load_dword v12, v3, s[0:3], 0 offen offset:32
 ; UNROLL3-NEXT:    buffer_load_dword v13, v3, s[0:3], 0 offen offset:36
 ; UNROLL3-NEXT:    buffer_load_dword v14, v3, s[0:3], 0 offen offset:40
@@ -5354,8 +5365,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
 ; UNROLL3-NEXT:    v_add_nc_u32_e32 v3, 48, v3
 ; UNROLL3-NEXT:    v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
-; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    s_waitcnt vmcnt(8)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
 ; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
@@ -5551,6 +5563,11 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_clause 0xf
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[16:19], v[20:21] offset:240
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[22:25], v[20:21] offset:224
+; ALIGNED-NEXT:    flat_load_dwordx4 v[26:29], v[20:21] offset:208
+; ALIGNED-NEXT:    flat_load_dwordx4 v[30:33], v[20:21] offset:192
+; ALIGNED-NEXT:    flat_load_dwordx4 v[34:37], v[20:21] offset:176
+; ALIGNED-NEXT:    flat_load_dwordx4 v[44:47], v[20:21] offset:160
+; ALIGNED-NEXT:    flat_load_dwordx4 v[50:53], v[20:21] offset:144
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[4:7], v[20:21]
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[8:11], v[20:21] offset:16
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[12:15], v[20:21] offset:32
@@ -5560,11 +5577,6 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[116:119], v[20:21] offset:96
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[66:69], v[20:21] offset:112
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[40:43], v[20:21] offset:128
-; ALIGNED-NEXT:    flat_load_dwordx4 v[50:53], v[20:21] offset:144
-; ALIGNED-NEXT:    flat_load_dwordx4 v[44:47], v[20:21] offset:160
-; ALIGNED-NEXT:    flat_load_dwordx4 v[34:37], v[20:21] offset:176
-; ALIGNED-NEXT:    flat_load_dwordx4 v[30:33], v[20:21] offset:192
-; ALIGNED-NEXT:    flat_load_dwordx4 v[26:29], v[20:21] offset:208
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
 ; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64
 ; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68
@@ -5618,7 +5630,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v22 offset:226
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:224
-; ALIGNED-NEXT:    s_waitcnt lgkmcnt(16)
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(29)
 ; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:32
 ; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:36
 ; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:40
@@ -5640,6 +5652,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v26 offset:210
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v26 offset:208
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(36)
 ; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:48
 ; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:52
 ; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:56
@@ -5661,6 +5674,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v30 offset:194
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:192
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(43)
 ; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:128
 ; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:132
 ; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:136
@@ -5682,6 +5696,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v34 offset:178
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v34 offset:176
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(50)
 ; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:144
 ; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:148
 ; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:152
@@ -5703,6 +5718,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v38 offset:162
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v38 offset:160
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(57)
 ; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:96
 ; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:100
 ; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:104
@@ -5724,6 +5740,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v50 offset:146
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:144
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(56)
 ; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:112
 ; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:116
 ; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:120
@@ -6180,6 +6197,11 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_clause 0xf
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[16:19], v[24:25] offset:240
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[20:23], v[24:25] offset:224
+; ALIGNED-NEXT:    flat_load_dwordx4 v[100:103], v[24:25] offset:208
+; ALIGNED-NEXT:    flat_load_dwordx4 v[96:99], v[24:25] offset:192
+; ALIGNED-NEXT:    flat_load_dwordx4 v[81:84], v[24:25] offset:176
+; ALIGNED-NEXT:    flat_load_dwordx4 v[66:69], v[24:25] offset:160
+; ALIGNED-NEXT:    flat_load_dwordx4 v[52:55], v[24:25] offset:144
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[4:7], v[24:25]
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[8:11], v[24:25] offset:16
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[12:15], v[24:25] offset:32
@@ -6189,11 +6211,6 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[26:29], v[24:25] offset:96
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[32:35], v[24:25] offset:112
 ; ALIGNED-NEXT:    flat_load_dwordx4 v[44:47], v[24:25] offset:128
-; ALIGNED-NEXT:    flat_load_dwordx4 v[52:55], v[24:25] offset:144
-; ALIGNED-NEXT:    flat_load_dwordx4 v[66:69], v[24:25] offset:160
-; ALIGNED-NEXT:    flat_load_dwordx4 v[81:84], v[24:25] offset:176
-; ALIGNED-NEXT:    flat_load_dwordx4 v[96:99], v[24:25] offset:192
-; ALIGNED-NEXT:    flat_load_dwordx4 v[100:103], v[24:25] offset:208
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
 ; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:320
 ; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:324
@@ -6243,7 +6260,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v36 offset:226
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:224
-; ALIGNED-NEXT:    s_waitcnt lgkmcnt(16)
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(29)
 ; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:288
 ; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:292
 ; ALIGNED-NEXT:    buffer_store_dword v102, off, s[0:3], s32 offset:296
@@ -6269,6 +6286,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v64 offset:210
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:208
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(36)
 ; ALIGNED-NEXT:    buffer_store_dword v96, off, s[0:3], s32 offset:304
 ; ALIGNED-NEXT:    buffer_store_dword v97, off, s[0:3], s32 offset:308
 ; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:312
@@ -6290,6 +6308,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v80 offset:194
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:192
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(43)
 ; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:384
 ; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:388
 ; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:392
@@ -6311,6 +6330,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v81 offset:178
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:176
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(50)
 ; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:400
 ; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:404
 ; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:408
@@ -6332,6 +6352,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v66 offset:162
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:160
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(57)
 ; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:352
 ; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:356
 ; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:360
@@ -6353,6 +6374,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v52 offset:146
 ; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:144
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(56)
 ; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:368
 ; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:372
 ; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:376
@@ -7054,6 +7076,11 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
 ; ALIGNED-NEXT:    s_clause 0xf
 ; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[20:21], off offset:240
 ; ALIGNED-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[26:29], v[20:21], off offset:208
+; ALIGNED-NEXT:    global_load_dwordx4 v[30:33], v[20:21], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[34:37], v[20:21], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[44:47], v[20:21], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[50:53], v[20:21], off offset:144
 ; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[20:21], off
 ; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[20:21], off offset:16
 ; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[20:21], off offset:32
@@ -7063,11 +7090,6 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
 ; ALIGNED-NEXT:    global_load_dwordx4 v[116:119], v[20:21], off offset:96
 ; ALIGNED-NEXT:    global_load_dwordx4 v[66:69], v[20:21], off offset:112
 ; ALIGNED-NEXT:    global_load_dwordx4 v[40:43], v[20:21], off offset:128
-; ALIGNED-NEXT:    global_load_dwordx4 v[50:53], v[20:21], off offset:144
-; ALIGNED-NEXT:    global_load_dwordx4 v[44:47], v[20:21], off offset:160
-; ALIGNED-NEXT:    global_load_dwordx4 v[34:37], v[20:21], off offset:176
-; ALIGNED-NEXT:    global_load_dwordx4 v[30:33], v[20:21], off offset:192
-; ALIGNED-NEXT:    global_load_dwordx4 v[26:29], v[20:21], off offset:208
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
 ; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64
 ; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68
@@ -7681,6 +7703,11 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
 ; ALIGNED-NEXT:    s_clause 0xf
 ; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[24:25], off offset:240
 ; ALIGNED-NEXT:    global_load_dwordx4 v[20:23], v[24:25], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[100:103], v[24:25], off offset:208
+; ALIGNED-NEXT:    global_load_dwordx4 v[96:99], v[24:25], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[81:84], v[24:25], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[66:69], v[24:25], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[24:25], off offset:144
 ; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[24:25], off
 ; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[24:25], off offset:16
 ; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[24:25], off offset:32
@@ -7690,11 +7717,6 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
 ; ALIGNED-NEXT:    global_load_dwordx4 v[26:29], v[24:25], off offset:96
 ; ALIGNED-NEXT:    global_load_dwordx4 v[32:35], v[24:25], off offset:112
 ; ALIGNED-NEXT:    global_load_dwordx4 v[44:47], v[24:25], off offset:128
-; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[24:25], off offset:144
-; ALIGNED-NEXT:    global_load_dwordx4 v[66:69], v[24:25], off offset:160
-; ALIGNED-NEXT:    global_load_dwordx4 v[81:84], v[24:25], off offset:176
-; ALIGNED-NEXT:    global_load_dwordx4 v[96:99], v[24:25], off offset:192
-; ALIGNED-NEXT:    global_load_dwordx4 v[100:103], v[24:25], off offset:208
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
 ; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:320
 ; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:324
@@ -10124,6 +10146,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_clause 0x3e
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126
 ; ALIGNED-NEXT:    buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:247
 ; ALIGNED-NEXT:    buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:246
 ; ALIGNED-NEXT:    buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:245
@@ -10186,8 +10209,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188
 ; ALIGNED-NEXT:    buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187
 ; ALIGNED-NEXT:    buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT:    buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185
 ; ALIGNED-NEXT:    s_clause 0x3a
+; ALIGNED-NEXT:    buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185
 ; ALIGNED-NEXT:    buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184
 ; ALIGNED-NEXT:    buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183
 ; ALIGNED-NEXT:    buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182
@@ -10246,16 +10269,15 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129
 ; ALIGNED-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128
 ; ALIGNED-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127
-; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_waitcnt vmcnt(62)
 ; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121
 ; ALIGNED-NEXT:    buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125
 ; ALIGNED-NEXT:    buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124
 ; ALIGNED-NEXT:    buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123
 ; ALIGNED-NEXT:    buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121
-; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
 ; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
@@ -11323,6 +11345,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_clause 0x3e
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126
 ; ALIGNED-NEXT:    buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199
 ; ALIGNED-NEXT:    buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198
 ; ALIGNED-NEXT:    buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197
@@ -11385,8 +11408,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140
 ; ALIGNED-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139
 ; ALIGNED-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137
 ; ALIGNED-NEXT:    s_clause 0xa
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137
 ; ALIGNED-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136
 ; ALIGNED-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135
 ; ALIGNED-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134
@@ -11397,10 +11420,10 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129
 ; ALIGNED-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128
 ; ALIGNED-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127
-; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_waitcnt vmcnt(62)
 ; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_clause 0x34
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73
 ; ALIGNED-NEXT:    buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125
 ; ALIGNED-NEXT:    buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124
 ; ALIGNED-NEXT:    buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123
@@ -11453,8 +11476,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76
 ; ALIGNED-NEXT:    buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75
 ; ALIGNED-NEXT:    buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74
-; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_waitcnt vmcnt(52)
 ; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
@@ -12436,94 +12458,104 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; CHECK-NEXT:  .LBB9_1: ; %memmove_fwd_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_clause 0x3e
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:32
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:36
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
-; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:44
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:48
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
-; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
-; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
-; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT:    buffer_load_dword v34, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT:    buffer_load_dword v33, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT:    buffer_load_dword v32, v2, s[0:3], 0 offen offset:180
-; CHECK-NEXT:    buffer_load_dword v31, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT:    buffer_load_dword v29, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT:    buffer_load_dword v28, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT:    buffer_load_dword v27, v2, s[0:3], 0 offen offset:160
-; CHECK-NEXT:    buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT:    buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT:    buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT:    buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
-; CHECK-NEXT:    buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT:    buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT:    buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT:    buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
-; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:236
 ; CHECK-NEXT:    buffer_load_dword v67, v2, s[0:3], 0 offen offset:252
 ; CHECK-NEXT:    buffer_load_dword v66, v2, s[0:3], 0 offen offset:248
 ; CHECK-NEXT:    buffer_load_dword v65, v2, s[0:3], 0 offen offset:244
 ; CHECK-NEXT:    buffer_load_dword v64, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:236
 ; CHECK-NEXT:    buffer_load_dword v54, v2, s[0:3], 0 offen offset:232
 ; CHECK-NEXT:    buffer_load_dword v53, v2, s[0:3], 0 offen offset:228
 ; CHECK-NEXT:    buffer_load_dword v52, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT:    buffer_load_dword v71, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    buffer_load_dword v34, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    buffer_load_dword v33, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    buffer_load_dword v32, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    buffer_load_dword v31, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    buffer_load_dword v29, v2, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    buffer_load_dword v28, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    buffer_load_dword v27, v2, s[0:3], 0 offen offset:160
 ; CHECK-NEXT:    buffer_load_dword v83, v2, s[0:3], 0 offen offset:156
 ; CHECK-NEXT:    buffer_load_dword v82, v2, s[0:3], 0 offen offset:152
 ; CHECK-NEXT:    buffer_load_dword v81, v2, s[0:3], 0 offen offset:148
 ; CHECK-NEXT:    buffer_load_dword v80, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    buffer_load_dword v71, v2, s[0:3], 0 offen offset:140
 ; CHECK-NEXT:    buffer_load_dword v70, v2, s[0:3], 0 offen offset:136
 ; CHECK-NEXT:    buffer_load_dword v69, v2, s[0:3], 0 offen offset:132
 ; CHECK-NEXT:    buffer_load_dword v68, v2, s[0:3], 0 offen offset:128
-; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:44
 ; CHECK-NEXT:    buffer_load_dword v96, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v97, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v98, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v99, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v87, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
 ; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
 ; CHECK-NEXT:    v_add_nc_u32_e32 v2, 0x100, v2
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    s_waitcnt vmcnt(60)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:240
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    s_waitcnt vmcnt(56)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(52)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(48)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[35:38] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(44)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[31:34] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(40)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[27:30] offset:160
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:144
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[23:26] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[19:22] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[15:18] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[11:14] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[7:10] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[3:6] offset:32
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87]
@@ -12540,51 +12572,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; CHECK-NEXT:  .LBB9_4: ; %memmove_bwd_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_clause 0x3e
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:32
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:36
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
-; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:44
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:48
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
-; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
-; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
-; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
 ; CHECK-NEXT:    buffer_load_dword v34, v2, s[0:3], 0 offen offset:252
 ; CHECK-NEXT:    buffer_load_dword v33, v2, s[0:3], 0 offen offset:248
 ; CHECK-NEXT:    buffer_load_dword v32, v2, s[0:3], 0 offen offset:244
 ; CHECK-NEXT:    buffer_load_dword v31, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
 ; CHECK-NEXT:    buffer_load_dword v29, v2, s[0:3], 0 offen offset:232
 ; CHECK-NEXT:    buffer_load_dword v28, v2, s[0:3], 0 offen offset:228
 ; CHECK-NEXT:    buffer_load_dword v27, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT:    buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
 ; CHECK-NEXT:    buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
 ; CHECK-NEXT:    buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
 ; CHECK-NEXT:    buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
 ; CHECK-NEXT:    buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
 ; CHECK-NEXT:    buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
 ; CHECK-NEXT:    buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
 ; CHECK-NEXT:    buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
-; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
 ; CHECK-NEXT:    buffer_load_dword v67, v2, s[0:3], 0 offen offset:188
 ; CHECK-NEXT:    buffer_load_dword v66, v2, s[0:3], 0 offen offset:184
 ; CHECK-NEXT:    buffer_load_dword v65, v2, s[0:3], 0 offen offset:180
 ; CHECK-NEXT:    buffer_load_dword v64, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
 ; CHECK-NEXT:    buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
 ; CHECK-NEXT:    buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
 ; CHECK-NEXT:    buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
@@ -12596,42 +12604,72 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; CHECK-NEXT:    buffer_load_dword v82, v2, s[0:3], 0 offen offset:136
 ; CHECK-NEXT:    buffer_load_dword v81, v2, s[0:3], 0 offen offset:132
 ; CHECK-NEXT:    buffer_load_dword v80, v2, s[0:3], 0 offen offset:128
-; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:44
 ; CHECK-NEXT:    buffer_load_dword v96, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v97, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v98, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v99, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v87, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
 ; CHECK-NEXT:    v_add_nc_u32_e32 v2, 0xffffff00, v2
 ; CHECK-NEXT:    s_add_u32 s4, s4, 0xffffff00
 ; CHECK-NEXT:    s_addc_u32 s5, s5, -1
-; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    s_waitcnt vmcnt(60)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[31:34] offset:240
-; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    s_waitcnt vmcnt(56)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[27:30] offset:224
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    s_waitcnt vmcnt(52)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:208
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    s_waitcnt vmcnt(48)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[35:38] offset:192
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    s_waitcnt vmcnt(44)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:176
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    s_waitcnt vmcnt(40)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:160
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:144
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[23:26] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[19:22] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[15:18] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[11:14] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[7:10] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[3:6] offset:32
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87]
@@ -13483,18 +13521,20 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    v_lshl_or_b32 v0, v109, 16, v95
 ; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_clause 0x4
-; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
 ; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
 ; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
 ; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15
 ; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT:    v_lshl_or_b32 v95, v4, 8, v6
-; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT:    v_lshl_or_b32 v109, v0, 8, v1
 ; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
-; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_clause 0x1
 ; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
 ; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10
@@ -15034,14 +15074,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
 ; ALIGNED-NEXT:    s_clause 0x3
-; ALIGNED-NEXT:    buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13
 ; ALIGNED-NEXT:    buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13
 ; ALIGNED-NEXT:    buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15
 ; ALIGNED-NEXT:    buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; ALIGNED-NEXT:    v_lshl_or_b32 v62, v127, 8, v0
-; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT:    buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v62, v127, 8, v0
 ; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT:    v_lshl_or_b32 v76, v104, 8, v76
 ; ALIGNED-NEXT:    buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
@@ -15857,14 +15897,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; UNROLL3-NEXT:  .LBB9_2: ; %memmove_fwd_loop
 ; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; UNROLL3-NEXT:    s_clause 0xb
-; UNROLL3-NEXT:    buffer_load_dword v4, v3, s[0:3], 0 offen
-; UNROLL3-NEXT:    buffer_load_dword v5, v3, s[0:3], 0 offen offset:4
-; UNROLL3-NEXT:    buffer_load_dword v6, v3, s[0:3], 0 offen offset:8
-; UNROLL3-NEXT:    buffer_load_dword v7, v3, s[0:3], 0 offen offset:12
 ; UNROLL3-NEXT:    buffer_load_dword v8, v3, s[0:3], 0 offen offset:16
 ; UNROLL3-NEXT:    buffer_load_dword v9, v3, s[0:3], 0 offen offset:20
 ; UNROLL3-NEXT:    buffer_load_dword v10, v3, s[0:3], 0 offen offset:24
 ; UNROLL3-NEXT:    buffer_load_dword v11, v3, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v4, v3, s[0:3], 0 offen
+; UNROLL3-NEXT:    buffer_load_dword v5, v3, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v6, v3, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v7, v3, s[0:3], 0 offen offset:12
 ; UNROLL3-NEXT:    buffer_load_dword v12, v3, s[0:3], 0 offen offset:32
 ; UNROLL3-NEXT:    buffer_load_dword v13, v3, s[0:3], 0 offen offset:36
 ; UNROLL3-NEXT:    buffer_load_dword v14, v3, s[0:3], 0 offen offset:40
@@ -15874,8 +15914,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
 ; UNROLL3-NEXT:    v_add_nc_u32_e32 v3, 48, v3
 ; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
-; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    s_waitcnt vmcnt(8)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
 ; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
@@ -15926,14 +15967,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; UNROLL3-NEXT:  .LBB9_6: ; %memmove_bwd_loop
 ; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; UNROLL3-NEXT:    s_clause 0xb
-; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; UNROLL3-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; UNROLL3-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; UNROLL3-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; UNROLL3-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; UNROLL3-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:32
 ; UNROLL3-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:36
 ; UNROLL3-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:40
@@ -15943,8 +15984,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; UNROLL3-NEXT:    v_subrev_nc_u32_e32 v2, 48, v2
 ; UNROLL3-NEXT:    s_add_u32 s4, s4, 0xffffffd0
 ; UNROLL3-NEXT:    s_addc_u32 s5, s5, -1
-; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    s_waitcnt vmcnt(8)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[15:16], v[7:10] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[15:16], v[3:6]
 ; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
 ; UNROLL3-NEXT:    flat_store_dwordx4 v[15:16], v[11:14] offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index 01b7f40f6256f..ba2facc5a8786 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -141,15 +141,16 @@ define void @memmove_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x4
 ; CHECK-NEXT:    flat_load_dword v8, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
 ; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
 ; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
 ; CHECK-NEXT:    flat_store_dword v[0:1], v8 offset:16
 ; CHECK-NEXT:    flat_load_dword v8, v[2:3] offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(5)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(5)
 ; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:20
@@ -198,11 +199,11 @@ define void @memmove_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:30
 ; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:16
+; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:30
 ; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
 ; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:16
 ; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -370,15 +371,16 @@ define void @memmove_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x4
 ; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dword v[0:1], v8 offset:16
 ; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:20
@@ -427,11 +429,11 @@ define void @memmove_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:16
+; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
 ; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:16
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
@@ -485,14 +487,14 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
 ; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
 ; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
 ; CHECK-NEXT:    ds_read_b128 v[2:5], v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
 ; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
 ; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
@@ -540,14 +542,14 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
 ; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
 ; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
 ; CHECK-NEXT:    ds_read_b128 v[2:5], v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
 ; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
 ; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
@@ -819,15 +821,16 @@ define void @memmove_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x4
 ; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dword v[0:1], v8 offset:16
 ; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:20
@@ -876,11 +879,11 @@ define void @memmove_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:16
+; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
 ; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:16
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
@@ -939,18 +942,20 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
@@ -966,15 +971,15 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
@@ -1008,18 +1013,20 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
@@ -1035,15 +1042,15 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
@@ -1077,19 +1084,20 @@ define void @memmove_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
@@ -1147,19 +1155,20 @@ define void @memmove_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
@@ -1319,15 +1328,16 @@ define void @memmove_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x4
 ; CHECK-NEXT:    flat_load_dword v8, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
 ; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
 ; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
 ; CHECK-NEXT:    global_store_dword v[0:1], v8, off offset:16
 ; CHECK-NEXT:    flat_load_dword v8, v[2:3] offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
 ; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:20
@@ -1373,11 +1383,11 @@ define void @memmove_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:30
 ; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:16
+; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:30
 ; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
 ; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:16
 ; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1536,15 +1546,16 @@ define void @memmove_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x4
 ; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    global_store_dword v[0:1], v8, off offset:16
 ; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:20
@@ -1590,11 +1601,11 @@ define void @memmove_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:16
+; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
 ; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:16
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
@@ -1755,12 +1766,12 @@ define void @memmove_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset1:1
 ; CHECK-NEXT:    ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:30
 ; CHECK-NEXT:    ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
+; CHECK-NEXT:    ds_read_u8 v10, v2 offset:30
 ; CHECK-NEXT:    ds_read_u16 v2, v2 offset:28
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
 ; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_short v[0:1], v2, off offset:28
@@ -1806,12 +1817,12 @@ define void @memmove_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_read_b128 v[3:6], v2
 ; CHECK-NEXT:    ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:30
 ; CHECK-NEXT:    ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
+; CHECK-NEXT:    ds_read_u8 v10, v2 offset:30
 ; CHECK-NEXT:    ds_read_u16 v2, v2 offset:28
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
 ; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_short v[0:1], v2, off offset:28
@@ -1963,15 +1974,16 @@ define void @memmove_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x4
 ; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    global_store_dword v[0:1], v8, off offset:16
 ; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:20
@@ -2017,11 +2029,11 @@ define void @memmove_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:16
+; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
 ; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
 ; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:16
 ; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
@@ -2077,20 +2089,22 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    global_store_dword v[0:1], v10, off offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
@@ -2145,20 +2159,22 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    global_store_dword v[0:1], v10, off offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
@@ -2213,18 +2229,20 @@ define void @memmove_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
@@ -2279,18 +2297,20 @@ define void @memmove_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
@@ -2513,11 +2533,11 @@ define void @memmove_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:30
 ; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:16
+; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:30
 ; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
 ; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; CHECK-NEXT:    ds_write_b32 v0, v8 offset:16
 ; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3260,19 +3280,20 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
@@ -3332,19 +3353,20 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
 ; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
@@ -3477,20 +3499,22 @@ define void @memmove_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
 ; CHECK-NEXT:    ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    ds_write_b32 v0, v7 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
 ; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    ds_write_b8 v0, v6 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ds_write_b128 v0, v[2:5]
@@ -3751,11 +3775,11 @@ define void @memmove_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:30
 ; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:16
+; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:30
 ; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
 ; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index 30ad3be46053c..344788fff76ee 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -12,23 +12,24 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
 ; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
 ; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
 ; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
 ; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7
@@ -49,26 +50,29 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
 ; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
 ; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
 ; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
 ; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
 ; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_e32 v3, v5, v4
 ; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -91,23 +95,24 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
 ; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
 ; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
 ; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
 ; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7
@@ -129,26 +134,29 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
 ; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
 ; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
 ; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
 ; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
 ; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_e32 v3, v5, v4
 ; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -172,23 +180,24 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
 ; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
 ; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
 ; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
 ; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7
@@ -211,22 +220,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    s_clause 0x7
 ; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
 ; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
 ; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
 ; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
 ; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
 ; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
 ; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_or_b32_e32 v3, v4, v7
@@ -253,23 +263,24 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
 ; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
 ; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
 ; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
 ; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
 ; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7
@@ -293,22 +304,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    s_clause 0x7
 ; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
 ; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
 ; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
 ; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
 ; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
 ; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
 ; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_or_b32_e32 v3, v4, v7
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index cf0fbe4506d20..ae57b8623f20b 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -677,13 +677,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    v_readlane_b32 s4, v17, 9
 ; W64-O0-NEXT:    v_readlane_b32 s5, v17, 10
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    global_store_dword v[3:4], v5, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    global_store_dword v[0:1], v2, off
@@ -1118,12 +1118,13 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_readlane_b32 s7, v13, 3
 ; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_and_b32_e64 v1, v1, s5
 ; W64-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v1, s4
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
 ; W64-O0-NEXT:    v_writelane_b32 v13, s4, 10
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 6368030b445fe..1c13a21f781d0 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -697,13 +697,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    v_readlane_b32 s4, v17, 9
 ; W64-O0-NEXT:    v_readlane_b32 s5, v17, 10
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    global_store_dword v[3:4], v5, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    global_store_dword v[0:1], v2, off
@@ -1149,12 +1149,13 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    v_readlane_b32 s7, v13, 3
 ; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_and_b32_e64 v1, v1, s5
 ; W64-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v1, s4
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
 ; W64-O0-NEXT:    v_writelane_b32 v13, s4, 10
@@ -1171,14 +1172,17 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readlane_b32 s4, v13, 0
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_mov_b32_e32 v6, v4
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_mov_b32_e32 v0, v3
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_mov_b32_e32 v4, v2
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_mov_b32_e32 v5, v1
 ; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
 ; W64-O0-NEXT:    v_mov_b32_e32 v1, v6
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 0741cb256cc24..eba995e13e448 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3641,11 +3641,11 @@ define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:6
 ; GFX10-NEXT:    global_load_ushort v3, v[0:1], off
 ; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:2
+; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:6
 ; GFX10-NEXT:    global_load_ushort v9, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v8, 16, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v9
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index f4a9e7e8f2759..994abb653e086 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -306,45 +306,45 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX906-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
 ; GFX906-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX906-NEXT:    v_readlane_b32 s30, v41, 0
 ; GFX906-NEXT:    s_mov_b32 s32, s33
 ; GFX906-NEXT:    v_readlane_b32 s4, v41, 4
 ; GFX906-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX906-NEXT:    v_readlane_b32 s35, v41, 3
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(28)
 ; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[30:33] offset:112
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[26:29] offset:96
@@ -685,40 +685,40 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; GFX908-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
-; GFX908-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
 ; GFX908-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
 ; GFX908-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
 ; GFX908-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
 ; GFX908-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
 ; GFX908-NEXT:    s_mov_b64 s[4:5], exec
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_waitcnt vmcnt(28)
 ; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[30:33] offset:112
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    flat_store_dwordx4 v[0:1], v[26:29] offset:96
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 85a9aba1a0e51..10ba7d8365d7f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -2163,9 +2163,9 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
 ; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
 ; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
@@ -2176,12 +2176,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2302,8 +2303,8 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    global_load_b64 v[12:13], v[8:9], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off
 ; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
+; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off
 ; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
@@ -2318,7 +2319,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 8abbdad893819..d6e62104dcfdb 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -586,47 +586,52 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 4
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_9
 ; GFX9-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[9:10], s4, v[9:10]
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX9-O0-NEXT:    v_or3_b32 v4, v4, v11, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -646,26 +651,29 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 8
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 9
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
@@ -676,35 +684,36 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 10
 ; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 11
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[22:23], s5, v[22:23]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v23
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
@@ -714,6 +723,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v4
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[2:3]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s4, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v29
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
@@ -723,25 +733,28 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v3, v4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s5, v[0:1]
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[6:7]
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
 ; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v28
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v26
 ; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
 ; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v24
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -750,12 +763,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -773,13 +788,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
 ; GFX9-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_and_b32_e64 v20, v11, v20
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_and_b32_e64 v22, v11, v18
 ; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v7
@@ -795,12 +814,15 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v17
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v19, vcc, v11, v16
@@ -888,29 +910,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v4
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
 ; GFX9-O0-NEXT:    v_or_b32_e64 v5, v5, v22
@@ -948,12 +971,16 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s7, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s6, s9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s7
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
@@ -989,10 +1016,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
@@ -1000,39 +1028,43 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -1054,10 +1086,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v2, s4, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v13, s4, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
 ; GFX9-O0-NEXT:    v_or_b32_e64 v12, v12, v15
@@ -1136,6 +1170,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX9-O0-NEXT:    s_branch .LBB0_7
 ; GFX9-O0-NEXT:  .LBB0_9: ; %udiv-end
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
@@ -1144,22 +1186,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 32
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[16:17]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v20
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v8, v1, v0
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], s4, v[20:21]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v20
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v16
@@ -1186,10 +1220,13 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v16, v5, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[8:9], s4, v[18:19]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v14
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v9, v8, v5
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], s4, v[14:15]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v18
@@ -1356,7 +1393,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
@@ -1888,47 +1927,52 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 3
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_9
 ; GFX9-O0-NEXT:  .LBB1_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[9:10], s4, v[9:10]
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX9-O0-NEXT:    v_or3_b32 v4, v4, v11, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -1948,26 +1992,29 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 6
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_4
 ; GFX9-O0-NEXT:  .LBB1_6: ; %udiv-do-while
@@ -1978,35 +2025,36 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 8
 ; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 9
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(22)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[22:23], s5, v[22:23]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v23
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
@@ -2016,6 +2064,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v4
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[2:3]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s4, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v29
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
@@ -2025,25 +2074,28 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v4, v3, v4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s5, v[0:1]
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[6:7]
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
 ; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v28
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(14)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v26
 ; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
 ; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(12)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v24
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -2052,12 +2104,14 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -2075,13 +2129,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
 ; GFX9-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_and_b32_e64 v20, v11, v20
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_and_b32_e64 v22, v11, v18
 ; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v7
@@ -2097,12 +2155,15 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v17
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v19, vcc, v11, v16
@@ -2190,29 +2251,30 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v4
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
 ; GFX9-O0-NEXT:    v_or_b32_e64 v5, v5, v22
@@ -2250,12 +2312,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s7, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s6, s9
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s7
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
@@ -2291,10 +2357,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
@@ -2302,39 +2369,43 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(20)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_6
 ; GFX9-O0-NEXT:  .LBB1_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -2356,10 +2427,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v2, s4, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v13, s4, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
 ; GFX9-O0-NEXT:    v_or_b32_e64 v12, v12, v15
@@ -2438,26 +2511,26 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_5
 ; GFX9-O0-NEXT:    s_branch .LBB1_7
 ; GFX9-O0-NEXT:  .LBB1_9: ; %udiv-end
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 32
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[2:3], s4, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v4, v5, v2
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[12:13], s4, v[12:13]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
@@ -2484,10 +2557,13 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v12, v3, v4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[2:3], s4, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v3, v2, v7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s4, v[10:11]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v14
@@ -2654,7 +2730,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v9
diff --git a/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir b/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir
new file mode 100644
index 0000000000000..f6c6f2d0fb77e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir
@@ -0,0 +1,198 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -start-before si-post-ra-bundler -o - %s | FileCheck %s
+
+--- |
+  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+  target triple = "amdgcn-amd-amdpal"
+  define dllexport amdgpu_cs void @_amdgpu_cs_main(ptr inreg noundef %userdata2, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+  ; CHECK-LABEL: _amdgpu_cs_main:
+  ; CHECK:       ; %bb.0: ; %.entry
+  ; CHECK-NEXT:    ; implicit-def: $vgpr11
+  ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; CHECK-NEXT:    v_cvt_u32_f32_e32 v15, v11
+  ; CHECK-NEXT:    ; implicit-def: $vgpr12
+  ; CHECK-NEXT:    v_cvt_u32_f32_e32 v28, v12
+  ; CHECK-NEXT:    ; implicit-def: $vgpr16
+  ; CHECK-NEXT:    ; implicit-def: $vgpr17
+  ; CHECK-NEXT:    ; implicit-def: $vgpr19
+  ; CHECK-NEXT:    ; implicit-def: $vgpr22
+  ; CHECK-NEXT:    ; implicit-def: $vgpr24
+  ; CHECK-NEXT:    ; implicit-def: $vgpr25
+  ; CHECK-NEXT:    ; implicit-def: $vgpr26
+  ; CHECK-NEXT:    ; implicit-def: $vgpr57
+  ; CHECK-NEXT:    ; implicit-def: $vgpr58
+  ; CHECK-NEXT:    ; implicit-def: $vgpr59
+  ; CHECK-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
+  ; CHECK-NEXT:    ; implicit-def: $vgpr6
+  ; CHECK-NEXT:    v_lshlrev_b32_e32 v56, 3, v6
+  ; CHECK-NEXT:    s_clause 0x1f
+  ; CHECK-NEXT:    image_load v9, [v57, v58, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v10, [v57, v58, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v12, [v15, v58, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v36, [v15, v58, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v41, [v57, v28, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v42, [v57, v28, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v47, [v15, v28, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v48, [v15, v28, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v11, [v57, v58, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v51, [v57, v58, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v35, [v15, v58, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v52, [v15, v58, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v49, [v57, v28, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v53, [v57, v28, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v50, [v15, v28, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v54, [v15, v28, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v20, v[57:59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v21, [v57, v58, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v31, [v15, v58, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v32, [v15, v58, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v37, [v57, v28, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v38, [v57, v28, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v43, [v15, v28, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v44, [v15, v28, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v29, [v57, v58, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v30, [v57, v58, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v33, [v15, v58, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v34, [v15, v58, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v39, [v57, v28, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v40, [v57, v28, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v45, [v15, v28, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    image_load v46, [v15, v28, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+  ; CHECK-NEXT:    ; implicit-def: $vgpr7
+  ; CHECK-NEXT:    ; implicit-def: $vgpr0
+  ; CHECK-NEXT:    ; implicit-def: $vgpr3
+  ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v7
+  ; CHECK-NEXT:    s_waitcnt vmcnt(30)
+  ; CHECK-NEXT:    v_max3_f32 v55, v9, 0, v10
+  ; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 10, v0
+  ; CHECK-NEXT:    s_waitcnt vmcnt(28)
+  ; CHECK-NEXT:    v_max3_f32 v36, v12, 0, v36
+  ; CHECK-NEXT:    v_lshlrev_b32_e32 v12, 1, v7
+  ; CHECK-NEXT:    s_waitcnt vmcnt(26)
+  ; CHECK-NEXT:    v_max3_f32 v41, v41, 0, v42
+  ; CHECK-NEXT:    v_lshlrev_b32_e32 v10, 1, v6
+  ; CHECK-NEXT:    s_waitcnt vmcnt(24)
+  ; CHECK-NEXT:    v_max3_f32 v42, v47, 0, v48
+  ; CHECK-NEXT:    s_waitcnt vmcnt(22)
+  ; CHECK-NEXT:    v_max3_f32 v47, v55, v11, v51
+  ; CHECK-NEXT:    v_mad_u32_u24 v11, 0x90, v7, v56
+  ; CHECK-NEXT:    s_waitcnt vmcnt(20)
+  ; CHECK-NEXT:    v_max3_f32 v35, v36, v35, v52
+  ; CHECK-NEXT:    s_waitcnt vmcnt(18)
+  ; CHECK-NEXT:    v_max3_f32 v36, v41, v49, v53
+  ; CHECK-NEXT:    s_waitcnt vmcnt(16)
+  ; CHECK-NEXT:    v_max3_f32 v41, v42, v50, v54
+  ; CHECK-NEXT:    s_waitcnt vmcnt(14)
+  ; CHECK-NEXT:    v_max3_f32 v20, v47, v20, v21
+  ; CHECK-NEXT:    v_add_nc_u32_e32 v21, 2, v3
+  ; CHECK-NEXT:    s_waitcnt vmcnt(12)
+  ; CHECK-NEXT:    v_max3_f32 v31, v35, v31, v32
+  ; CHECK-NEXT:    s_waitcnt vmcnt(10)
+  ; CHECK-NEXT:    v_max3_f32 v32, v36, v37, v38
+  ; CHECK-NEXT:    s_waitcnt vmcnt(8)
+  ; CHECK-NEXT:    v_max3_f32 v35, v41, v43, v44
+  ; CHECK-NEXT:    s_waitcnt vmcnt(6)
+  ; CHECK-NEXT:    v_max3_f32 v29, v20, v29, v30
+  ; CHECK-NEXT:    v_add_nc_u32_e32 v20, 3, v3
+  ; CHECK-NEXT:    s_waitcnt vmcnt(4)
+  ; CHECK-NEXT:    v_max3_f32 v30, v31, v33, v34
+  ; CHECK-NEXT:    s_waitcnt vmcnt(2)
+  ; CHECK-NEXT:    v_max3_f32 v31, v32, v39, v40
+  ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+  ; CHECK-NEXT:    v_max3_f32 v32, v35, v45, v46
+  ; CHECK-NEXT:    ds_store_2addr_b32 v11, v29, v31 offset1:1
+  ; CHECK-NEXT:    ds_store_2addr_b32 v11, v30, v32 offset0:18 offset1:19
+  ; CHECK-NEXT:    s_endpgm
+  .entry:
+    ret void
+  }
+
+...
+---
+name:            _amdgpu_cs_main
+exposesReturnsTwice: false
+tracksRegLiveness: true
+body:             |
+  bb.0..entry:
+
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr3 = IMPLICIT_DEF
+    $vgpr6 = IMPLICIT_DEF
+    $vgpr7 = IMPLICIT_DEF
+    $vgpr11 = IMPLICIT_DEF
+    $vgpr12 = IMPLICIT_DEF
+    $vgpr16 = IMPLICIT_DEF
+    $vgpr17 = IMPLICIT_DEF
+    $vgpr19 = IMPLICIT_DEF
+    $vgpr22 = IMPLICIT_DEF
+    $vgpr24 = IMPLICIT_DEF
+    $vgpr25 = IMPLICIT_DEF
+    $vgpr26 = IMPLICIT_DEF
+    $vgpr57 = IMPLICIT_DEF
+    $vgpr58 = IMPLICIT_DEF
+    $vgpr59 = IMPLICIT_DEF
+    $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = IMPLICIT_DEF
+
+    renamable $vgpr15 = nofpexcept V_CVT_U32_F32_e32 killed $vgpr11, implicit $mode, implicit $exec
+    renamable $vgpr28 = nofpexcept V_CVT_U32_F32_e32 killed $vgpr12, implicit $mode, implicit $exec
+    renamable $vgpr20 = IMAGE_LOAD_V1_V3_gfx11 $vgpr57_vgpr58_vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr21 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr29 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr30 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr11 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr9 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr10 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr31 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr32 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr33 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr34 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr35 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr12 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr36 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr37 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr38 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr39 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr40 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr41 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr42 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr43 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr44 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr45 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr46 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr47 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr48 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr49 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr50 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr51 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr52 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr53 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr54 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+    renamable $vgpr55 = V_MAX3_F32_e64 0, killed $vgpr9, 0, 0, 0, killed $vgpr10, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr9 = V_LSHRREV_B32_e32 10, $vgpr0, implicit $exec
+    renamable $vgpr36 = V_MAX3_F32_e64 0, killed $vgpr12, 0, 0, 0, killed $vgpr36, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr56 = V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec
+    renamable $vgpr41 = V_MAX3_F32_e64 0, killed $vgpr41, 0, 0, 0, killed $vgpr42, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr12 = nuw nsw V_LSHLREV_B32_e32 1, $vgpr7, implicit $exec
+    renamable $vgpr42 = V_MAX3_F32_e64 0, killed $vgpr47, 0, 0, 0, killed $vgpr48, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr10 = nuw nsw V_LSHLREV_B32_e32 1, $vgpr6, implicit $exec
+    V_CMP_EQ_U32_e32 7, $vgpr7, implicit-def $vcc, implicit $exec
+    renamable $vgpr47 = V_MAX3_F32_e64 0, killed $vgpr55, 0, killed $vgpr11, 0, killed $vgpr51, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr35 = V_MAX3_F32_e64 0, killed $vgpr36, 0, killed $vgpr35, 0, killed $vgpr52, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr36 = V_MAX3_F32_e64 0, killed $vgpr41, 0, killed $vgpr49, 0, killed $vgpr53, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr41 = V_MAX3_F32_e64 0, killed $vgpr42, 0, killed $vgpr50, 0, killed $vgpr54, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr11 = V_MAD_U32_U24_e64 144, $vgpr7, killed $vgpr56, 0, implicit $exec
+    renamable $vgpr20 = V_MAX3_F32_e64 0, killed $vgpr47, 0, killed $vgpr20, 0, killed $vgpr21, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr31 = V_MAX3_F32_e64 0, killed $vgpr35, 0, killed $vgpr31, 0, killed $vgpr32, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr32 = V_MAX3_F32_e64 0, killed $vgpr36, 0, killed $vgpr37, 0, killed $vgpr38, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr35 = V_MAX3_F32_e64 0, killed $vgpr41, 0, killed $vgpr43, 0, killed $vgpr44, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr21 = V_ADD_U32_e32 2, $vgpr3, implicit $exec
+    renamable $vgpr29 = V_MAX3_F32_e64 0, killed $vgpr20, 0, killed $vgpr29, 0, killed $vgpr30, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr30 = V_MAX3_F32_e64 0, killed $vgpr31, 0, killed $vgpr33, 0, killed $vgpr34, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr31 = V_MAX3_F32_e64 0, killed $vgpr32, 0, killed $vgpr39, 0, killed $vgpr40, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr32 = V_MAX3_F32_e64 0, killed $vgpr35, 0, killed $vgpr45, 0, killed $vgpr46, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr20 = V_ADD_U32_e32 3, $vgpr3, implicit $exec
+    DS_WRITE2_B32_gfx9 renamable $vgpr11, killed renamable $vgpr29, killed renamable $vgpr31, 0, 1, 0, implicit $exec :: (store (s32) into %ir.out0, addrspace 3), (store (s32) into %ir.out1, addrspace 3)
+    DS_WRITE2_B32_gfx9 renamable $vgpr11, killed renamable $vgpr30, killed renamable $vgpr32, 18, 19, 0, implicit $exec :: (store (s32) into %ir.out2, addrspace 3), (store (s32) into %ir.out3, addrspace 3)
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 50056b62b3397..a4a4c33ccfe3e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -9866,11 +9866,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
 ; GFX6-NEXT:    s_mov_b32 s2, 0x83200
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
 ; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
@@ -10324,23 +10324,22 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2030
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
@@ -10358,10 +10357,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2080
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2060
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[7:10], v5, s[38:39]
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, 1
@@ -10499,6 +10498,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_lshlrev_b32_e32 v5, 8, v0
 ; GFX10-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-FLATSCR-NEXT:    s_clause 0xf
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39]
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[35:38], v5, s[38:39] offset:240
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[31:34], v5, s[38:39] offset:224
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[27:30], v5, s[38:39] offset:208
@@ -10514,8 +10514,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[47:50], v5, s[38:39] offset:48
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[43:46], v5, s[38:39] offset:32
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[39:42], v5, s[38:39] offset:16
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39]
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(15)
 ; GFX10-FLATSCR-NEXT:    v_lshl_add_u32 v4, v0, 13, 16
 ; GFX10-FLATSCR-NEXT:    scratch_store_dword v4, v6, off
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
@@ -10546,6 +10545,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35]
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
+; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v88, v58
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v92, v62
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v87, v57
@@ -10564,6 +10564,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v66, v36
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v65, v35
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v36, v10
+; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v72, v42
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v76, v46
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v80, v50
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
index bce7c1e5e8ab7..51fc72be41f36 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
@@ -2080,8 +2080,8 @@ define double @test_vector_reduce_fadd_v16double(double %sp, <16 x double> %v) {
 ; GFX9-SDAG-LABEL: test_vector_reduce_fadd_v16double:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    scratch_load_dword v33, off, s32 offset:8
 ; GFX9-SDAG-NEXT:    scratch_load_dword v31, off, s32
+; GFX9-SDAG-NEXT:    scratch_load_dword v33, off, s32 offset:8
 ; GFX9-SDAG-NEXT:    scratch_load_dword v32, off, s32 offset:4
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
@@ -2097,7 +2097,7 @@ define double @test_vector_reduce_fadd_v16double(double %sp, <16 x double> %v) {
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[24:25]
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[26:27]
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[28:29]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[30:31]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[32:33]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
index 657fe0f0804f3..3b8c3de3e5433 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
@@ -2080,8 +2080,8 @@ define double @test_vector_reduce_fmul_v16double(double %sp, <16 x double> %v) {
 ; GFX9-SDAG-LABEL: test_vector_reduce_fmul_v16double:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    scratch_load_dword v33, off, s32 offset:8
 ; GFX9-SDAG-NEXT:    scratch_load_dword v31, off, s32
+; GFX9-SDAG-NEXT:    scratch_load_dword v33, off, s32 offset:8
 ; GFX9-SDAG-NEXT:    scratch_load_dword v32, off, s32 offset:4
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
@@ -2097,7 +2097,7 @@ define double @test_vector_reduce_fmul_v16double(double %sp, <16 x double> %v) {
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[24:25]
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[26:27]
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[28:29]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[30:31]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[32:33]
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index a42c8ac706d27..ab6df6462816a 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -384,8 +384,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
 ; DAGISEL-NEXT:    s_clause 0x3
-; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:8
 ; DAGISEL-NEXT:    scratch_load_b32 v49, off, s32 offset:16
 ; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
@@ -423,8 +423,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
 ; GISEL-NEXT:    s_clause 0x3
-; GISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:8
 ; GISEL-NEXT:    scratch_load_b32 v49, off, s32 offset:16
 ; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
@@ -463,8 +463,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
 ; DAGISEL64-NEXT:    s_clause 0x3
-; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:8
 ; DAGISEL64-NEXT:    scratch_load_b32 v49, off, s32 offset:16
 ; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
@@ -503,8 +503,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
 ; GISEL64-NEXT:    s_clause 0x3
-; GISEL64-NEXT:    scratch_load_b32 v2, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GISEL64-NEXT:    scratch_load_b32 v2, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:8
 ; GISEL64-NEXT:    scratch_load_b32 v49, off, s32 offset:16
 ; GISEL64-NEXT:    s_mov_b64 exec, vcc
@@ -541,8 +541,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
 ; GFX1250-DAGISEL-NEXT:    s_clause 0x3
-; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:8
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v49, off, s32 offset:16
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 74e9ab718c3d2..97be3c7e3b806 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -188,9 +188,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O0-NEXT:  ; %bb.1: ; %if
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
@@ -1027,34 +1027,37 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v36
 ; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr35 killed $exec
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(13)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(11)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v11
 ; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
@@ -1135,6 +1138,7 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v33
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v11
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 6347a3783c9c6..d3f646585e4f3 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -165,9 +165,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O0-NEXT:  ; %bb.1: ; %if
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
@@ -997,9 +997,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9-O0-NEXT:  ; %bb.1: ; %if
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0


