[llvm] [AMDGPU] Improve isBasicBlockPrologue to only add necessary instructions (PR #113303)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 22 05:10:01 PDT 2024


https://github.com/alex-t created https://github.com/llvm/llvm-project/pull/113303

Currently, we consider all SGPR spill and WWM spill instructions to belong Basic Block prologue.
We only need those producing operands for the EXEC mask manipulation at the Block beginning or those producing operands for the formers.

>From df1f23175b05388718a6d8fe78d71521840b66f4 Mon Sep 17 00:00:00 2001
From: Alexander Timofeev <alexander.timofeev at amd.com>
Date: Mon, 21 Oct 2024 18:39:20 +0200
Subject: [PATCH] [AMDGPU] Improve isBasicBlockPrologue to only add necessary
 instructions

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  19 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   2 +
 .../GlobalISel/image-waterfall-loop-O0.ll     |  14 +-
 llvm/test/CodeGen/AMDGPU/collapse-endcf.ll    |  60 ++-
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/div_i128.ll          | 158 +++---
 .../identical-subrange-spill-infloop.ll       | 344 ++++++-------
 .../CodeGen/AMDGPU/indirect-addressing-si.ll  | 129 ++---
 ...uf-legalize-operands-non-ptr-intrinsics.ll | 102 ++--
 .../CodeGen/AMDGPU/mubuf-legalize-operands.ll | 111 ++--
 .../AMDGPU/partial-sgpr-to-vgpr-spills.ll     |  33 +-
 .../AMDGPU/postra-sink-update-dependency.mir  |   6 +-
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          |  78 ++-
 .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll     |   7 +-
 .../CodeGen/AMDGPU/spill-scavenge-offset.ll   | 482 +++++++++---------
 llvm/test/CodeGen/AMDGPU/trap-abis.ll         |   5 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |   4 +-
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      |   8 +-
 18 files changed, 735 insertions(+), 829 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 89a2eb4f18946b..f0ad090ec393e7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8901,6 +8901,22 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
   return AMDGPU::COPY;
 }
 
+bool SIInstrInfo::isPrologueOperandReload(const MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  if ((isSGPRSpill(MI) &&
+       (MI.mayLoad() || Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)) ||
+      (isWWMRegSpillOpcode(Opcode) && MI.mayLoad())) {
+    Register Reg = MI.defs().begin()->getReg();
+    const MachineBasicBlock *MBB = MI.getParent();
+    MachineBasicBlock::const_instr_iterator I(MI), E = MBB->instr_end();
+    while (++I != E) {
+      if (I->readsRegister(Reg, &RI) && isBasicBlockPrologue(*I))
+        return true;
+    }
+  }
+  return false;
+}
+
 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
                                        Register Reg) const {
   // We need to handle instructions which may be inserted during register
@@ -8917,8 +8933,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
 
   uint16_t Opcode = MI.getOpcode();
   return IsNullOrVectorRegister &&
-         (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
-          Opcode == AMDGPU::IMPLICIT_DEF ||
+         (isPrologueOperandReload(MI) || Opcode == AMDGPU::IMPLICIT_DEF ||
           (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
            MI.modifiesRegister(AMDGPU::EXEC, &RI)));
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7041b59964645a..04b0414f9050ff 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1342,6 +1342,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   bool isBasicBlockPrologue(const MachineInstr &MI,
                             Register Reg = Register()) const override;
 
+  bool isPrologueOperandReload(const MachineInstr &MI) const;
+
   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator InsPt,
                                          const DebugLoc &DL, Register Src,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index 88fd7dcce35f68..eb864a0757c854 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -68,9 +68,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s21
 ; CHECK-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
-; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b32 exec_lo, s21
 ; CHECK-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -87,7 +84,10 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
+; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b32 exec_lo, s21
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
 ; CHECK-NEXT:    v_readfirstlane_b32 s12, v7
 ; CHECK-NEXT:    v_readfirstlane_b32 s10, v6
 ; CHECK-NEXT:    v_readfirstlane_b32 s9, v5
@@ -104,6 +104,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    s_mov_b32 s17, s6
 ; CHECK-NEXT:    s_mov_b32 s18, s5
 ; CHECK-NEXT:    s_mov_b32 s19, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_writelane_b32 v16, s12, 5
 ; CHECK-NEXT:    v_writelane_b32 v16, s13, 6
 ; CHECK-NEXT:    v_writelane_b32 v16, s14, 7
@@ -137,6 +138,8 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s21
 ; CHECK-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
 ; CHECK-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s21
@@ -154,9 +157,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    v_readlane_b32 s17, v16, 1
 ; CHECK-NEXT:    v_readlane_b32 s18, v16, 2
 ; CHECK-NEXT:    v_readlane_b32 s19, v16, 3
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index fe17ff169cb14b..2e69bcd1ce152c 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -67,6 +67,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB0_4
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -74,14 +75,12 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
 ; GCN-O0-NEXT:    s_mov_b32 s1, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
@@ -100,6 +99,8 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB0_3
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb.inner.then
+; GCN-O0-NEXT:    s_waitcnt expcnt(1)
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -107,9 +108,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -236,6 +235,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB1_3
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -243,14 +243,12 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
 ; GCN-O0-NEXT:    s_mov_b32 s1, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3
@@ -269,6 +267,8 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB1_4
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb.inner.then
+; GCN-O0-NEXT:    s_waitcnt expcnt(1)
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -276,9 +276,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -312,9 +310,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    v_readlane_b32 s2, v4, 4
 ; GCN-O0-NEXT:    v_readlane_b32 s3, v4, 5
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
@@ -456,17 +454,18 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB2_6
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.outer.then
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[0:1], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], exec
 ; GCN-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GCN-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_writelane_b32 v4, s2, 4
 ; GCN-O0-NEXT:    v_writelane_b32 v4, s3, 5
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -493,6 +492,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB2_5
 ; GCN-O0-NEXT:  ; %bb.3: ; %bb.then
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -500,9 +500,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -518,6 +516,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; GCN-O0-NEXT:    s_branch .LBB2_5
 ; GCN-O0-NEXT:  .LBB2_4: ; %bb.else
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -525,9 +524,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v4, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v4, 1
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -724,13 +721,13 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB3_8
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb.outer.then
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0
 ; GCN-O0-NEXT:    s_mov_b32 s4, s2
@@ -740,11 +737,12 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, 1
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 offset:4
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
 ; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s0, 4
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s1, 5
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
@@ -770,13 +768,13 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
 ; GCN-O0-NEXT:    s_branch .LBB3_7
 ; GCN-O0-NEXT:  .LBB3_4: ; %bb.outer.else
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s1, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    s_mov_b32 s2, s0
@@ -786,10 +784,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, 3
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 offset:12
 ; GCN-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
 ; GCN-O0-NEXT:    s_mov_b64 s[0:1], exec
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s0, 6
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s1, 7
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
@@ -927,6 +926,7 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB4_2
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.then
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -934,14 +934,12 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v3, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v3, 1
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v2, 31, v0
 ; GCN-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v2
@@ -1066,6 +1064,8 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
 ; GCN-O0-NEXT:  .LBB5_1: ; %bb1
 ; GCN-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-O0-NEXT:    s_waitcnt expcnt(1)
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -1077,9 +1077,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 1
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s6, 4
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s7, 5
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0x207
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_cmp_lt_i32_e64 s[4:5], v0, s4
 ; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-O0-NEXT:    v_writelane_b32 v6, s4, 6
@@ -1279,14 +1277,14 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    v_readlane_b32 s8, v6, 16
 ; GCN-O0-NEXT:    v_readlane_b32 s9, v6, 17
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-O0-NEXT:    v_readlane_b32 s6, v6, 4
-; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 5
-; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 14
-; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 15
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    v_readlane_b32 s6, v6, 4
+; GCN-O0-NEXT:    v_readlane_b32 s7, v6, 5
+; GCN-O0-NEXT:    v_readlane_b32 s4, v6, 14
+; GCN-O0-NEXT:    v_readlane_b32 s5, v6, 15
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
 ; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 09dc6d6bff9e31..ccb0033a2eb2f3 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -148,9 +148,9 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    s_add_u32 s16, s16, func_struct at rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_struct at rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index d94ec56842ab87..452130914c77c3 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -672,12 +672,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 11
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
@@ -702,8 +696,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 11
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
@@ -730,7 +729,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
 ; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
@@ -740,7 +738,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
 ; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
@@ -752,12 +749,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -897,9 +892,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB0_6
 ; GFX9-O0-NEXT:    s_branch .LBB0_1
 ; GFX9-O0-NEXT:  .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
@@ -916,9 +908,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[20:21]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s6, 64
@@ -999,6 +994,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 10
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 11
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
@@ -1030,9 +1026,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -1041,9 +1034,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
@@ -1054,7 +1050,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -1158,6 +1154,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 8
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
@@ -1823,12 +1820,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-G-O0-NEXT:  .LBB0_6: ; %udiv-do-while
 ; GFX9-G-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT:    v_readlane_b32 s6, v34, 6
-; GFX9-G-O0-NEXT:    v_readlane_b32 s7, v34, 7
 ; GFX9-G-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
@@ -1853,11 +1844,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    v_readlane_b32 s6, v34, 6
+; GFX9-G-O0-NEXT:    v_readlane_b32 s7, v34, 7
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 1
@@ -1907,10 +1902,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v28, v30
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v29, v31
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v24, v32
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v25, v33
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v28
@@ -1930,7 +1923,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v14
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v15
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v13, s[8:9], v13, v4
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v12, s[8:9], v12, v9, s[8:9]
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v10, v7, s[8:9]
@@ -2043,9 +2035,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_cbranch_execnz .LBB0_6
 ; GFX9-G-O0-NEXT:    s_branch .LBB0_1
 ; GFX9-G-O0-NEXT:  .LBB0_7: ; %udiv-preheader
-; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -2063,12 +2052,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v22, v17
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v23, v16
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9-G-O0-NEXT:    v_sub_u32_e64 v4, v18, v4
@@ -2079,7 +2071,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v18, v6
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX9-G-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v18, v6
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[6:7], v18, v[20:21]
 ; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[25:26], v18, v[22:23]
 ; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[23:24], v5, v[20:21]
@@ -2128,6 +2120,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_writelane_b32 v34, s8, 6
 ; GFX9-G-O0-NEXT:    v_writelane_b32 v34, s9, 7
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
@@ -2159,9 +2152,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-G-O0-NEXT:  .LBB0_8: ; %udiv-bb1
-; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -2170,17 +2160,20 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s9, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_add_co_u32_e64 v4, s[6:7], v2, v4
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s10
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v5, s[6:7], v5, v7, s[6:7]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7]
@@ -2272,6 +2265,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-G-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-G-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-G-O0-NEXT:    v_writelane_b32 v34, s6, 4
 ; GFX9-G-O0-NEXT:    v_writelane_b32 v34, s7, 5
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
@@ -2866,12 +2860,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB1_4
 ; GFX9-O0-NEXT:  .LBB1_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 9
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
@@ -2896,8 +2884,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 9
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
@@ -2924,7 +2917,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
 ; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
@@ -2934,7 +2926,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
 ; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
@@ -2946,12 +2937,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -3091,9 +3080,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB1_6
 ; GFX9-O0-NEXT:    s_branch .LBB1_1
 ; GFX9-O0-NEXT:  .LBB1_7: ; %udiv-preheader
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
@@ -3110,9 +3096,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[20:21]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s6, 64
@@ -3193,6 +3182,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 8
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
@@ -3224,9 +3214,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_6
 ; GFX9-O0-NEXT:  .LBB1_8: ; %udiv-bb1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -3235,9 +3222,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
@@ -3248,7 +3238,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -3352,6 +3342,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 6
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 7
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
@@ -3920,12 +3911,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_branch .LBB1_4
 ; GFX9-G-O0-NEXT:  .LBB1_6: ; %udiv-do-while
 ; GFX9-G-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT:    v_readlane_b32 s6, v34, 6
-; GFX9-G-O0-NEXT:    v_readlane_b32 s7, v34, 7
 ; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
@@ -3950,11 +3935,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    v_readlane_b32 s6, v34, 6
+; GFX9-G-O0-NEXT:    v_readlane_b32 s7, v34, 7
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 1
@@ -4004,10 +3993,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v28, v30
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v29, v31
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v20, v32
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v33
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v28
@@ -4027,7 +4014,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v13
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v11, s[8:9], v11, v4
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9]
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9]
@@ -4148,9 +4134,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_cbranch_execnz .LBB1_6
 ; GFX9-G-O0-NEXT:    s_branch .LBB1_1
 ; GFX9-G-O0-NEXT:  .LBB1_7: ; %udiv-preheader
-; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
@@ -4168,11 +4151,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v5
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v4
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v7
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v20, v6
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s4
@@ -4238,6 +4224,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_writelane_b32 v34, s8, 6
 ; GFX9-G-O0-NEXT:    v_writelane_b32 v34, s9, 7
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
@@ -4269,9 +4256,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB1_6
 ; GFX9-G-O0-NEXT:  .LBB1_8: ; %udiv-bb1
-; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
@@ -4280,17 +4264,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s9, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-G-O0-NEXT:    v_add_co_u32_e64 v4, s[6:7], v1, v4
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s10
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v7, s[6:7], v2, v3, s[6:7]
@@ -4382,6 +4369,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-G-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-G-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(17)
 ; GFX9-G-O0-NEXT:    v_writelane_b32 v34, s6, 4
 ; GFX9-G-O0-NEXT:    v_writelane_b32 v34, s7, 5
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index a4a8f43646d4ba..6c45fbf8b8e3b8 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -8,7 +8,6 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    v_writelane_b32 v5, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v5, s31, 1
@@ -32,90 +31,81 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b32 s5, s24
 ; CHECK-NEXT:    v_writelane_b32 v5, s51, 17
 ; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
-; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; CHECK-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
 ; CHECK-NEXT:    s_movk_i32 s20, 0x130
 ; CHECK-NEXT:    s_mov_b32 s21, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v7, s36, 0
-; CHECK-NEXT:    v_writelane_b32 v7, s37, 1
-; CHECK-NEXT:    v_writelane_b32 v7, s38, 2
-; CHECK-NEXT:    v_writelane_b32 v7, s39, 3
-; CHECK-NEXT:    v_writelane_b32 v7, s40, 4
-; CHECK-NEXT:    v_writelane_b32 v7, s41, 5
-; CHECK-NEXT:    v_writelane_b32 v7, s42, 6
-; CHECK-NEXT:    v_writelane_b32 v7, s43, 7
-; CHECK-NEXT:    v_writelane_b32 v7, s44, 8
-; CHECK-NEXT:    v_writelane_b32 v7, s45, 9
-; CHECK-NEXT:    v_writelane_b32 v7, s46, 10
+; CHECK-NEXT:    v_writelane_b32 v6, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v6, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v6, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v6, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v6, s40, 4
+; CHECK-NEXT:    v_writelane_b32 v6, s41, 5
+; CHECK-NEXT:    v_writelane_b32 v6, s42, 6
+; CHECK-NEXT:    v_writelane_b32 v6, s43, 7
+; CHECK-NEXT:    v_writelane_b32 v6, s44, 8
+; CHECK-NEXT:    v_writelane_b32 v6, s45, 9
+; CHECK-NEXT:    v_writelane_b32 v6, s46, 10
 ; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
-; CHECK-NEXT:    v_writelane_b32 v7, s47, 11
-; CHECK-NEXT:    v_writelane_b32 v7, s48, 12
+; CHECK-NEXT:    v_writelane_b32 v6, s47, 11
+; CHECK-NEXT:    v_writelane_b32 v6, s48, 12
 ; CHECK-NEXT:    s_mov_b32 s20, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    v_writelane_b32 v7, s49, 13
+; CHECK-NEXT:    v_writelane_b32 v6, s49, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s28
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v1
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
 ; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_writelane_b32 v7, s50, 14
-; CHECK-NEXT:    v_writelane_b32 v7, s51, 15
+; CHECK-NEXT:    v_writelane_b32 v6, s50, 14
+; CHECK-NEXT:    v_writelane_b32 v6, s51, 15
 ; CHECK-NEXT:    image_sample_lz v3, v[2:3], s[44:51], s[20:23] dmask:0x1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v7, s4, 16
-; CHECK-NEXT:    v_writelane_b32 v7, s5, 17
-; CHECK-NEXT:    v_writelane_b32 v7, s6, 18
-; CHECK-NEXT:    v_writelane_b32 v7, s7, 19
-; CHECK-NEXT:    v_writelane_b32 v7, s8, 20
-; CHECK-NEXT:    v_writelane_b32 v7, s9, 21
-; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[4:11], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_writelane_b32 v7, s10, 22
-; CHECK-NEXT:    v_writelane_b32 v7, s11, 23
-; CHECK-NEXT:    v_writelane_b32 v7, s12, 24
-; CHECK-NEXT:    v_writelane_b32 v7, s13, 25
-; CHECK-NEXT:    v_writelane_b32 v7, s14, 26
-; CHECK-NEXT:    v_writelane_b32 v7, s15, 27
+; CHECK-NEXT:    v_writelane_b32 v6, s4, 16
 ; CHECK-NEXT:    v_writelane_b32 v5, s52, 18
-; CHECK-NEXT:    v_writelane_b32 v7, s16, 28
 ; CHECK-NEXT:    v_writelane_b32 v5, s53, 19
-; CHECK-NEXT:    v_writelane_b32 v7, s17, 29
+; CHECK-NEXT:    v_writelane_b32 v6, s5, 17
 ; CHECK-NEXT:    v_writelane_b32 v5, s54, 20
-; CHECK-NEXT:    v_writelane_b32 v7, s18, 30
-; CHECK-NEXT:    s_mov_b32 s26, 48
-; CHECK-NEXT:    s_mov_b32 s27, s24
+; CHECK-NEXT:    v_writelane_b32 v6, s6, 18
+; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[4:11], s[20:23] dmask:0x1
 ; CHECK-NEXT:    v_writelane_b32 v5, s55, 21
-; CHECK-NEXT:    v_writelane_b32 v7, s19, 31
-; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[26:27], 0x0
+; CHECK-NEXT:    v_writelane_b32 v6, s7, 19
 ; CHECK-NEXT:    v_writelane_b32 v5, s56, 22
+; CHECK-NEXT:    v_writelane_b32 v6, s8, 20
 ; CHECK-NEXT:    v_writelane_b32 v5, s57, 23
+; CHECK-NEXT:    v_writelane_b32 v6, s9, 21
 ; CHECK-NEXT:    v_writelane_b32 v5, s58, 24
+; CHECK-NEXT:    v_writelane_b32 v6, s10, 22
 ; CHECK-NEXT:    v_writelane_b32 v5, s59, 25
+; CHECK-NEXT:    v_writelane_b32 v6, s11, 23
 ; CHECK-NEXT:    v_writelane_b32 v5, s60, 26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v7, s4, 32
+; CHECK-NEXT:    v_writelane_b32 v6, s12, 24
 ; CHECK-NEXT:    v_writelane_b32 v5, s61, 27
-; CHECK-NEXT:    v_writelane_b32 v7, s5, 33
+; CHECK-NEXT:    v_writelane_b32 v6, s13, 25
 ; CHECK-NEXT:    v_writelane_b32 v5, s62, 28
-; CHECK-NEXT:    v_writelane_b32 v7, s6, 34
+; CHECK-NEXT:    v_writelane_b32 v6, s14, 26
 ; CHECK-NEXT:    v_writelane_b32 v5, s63, 29
-; CHECK-NEXT:    v_writelane_b32 v7, s7, 35
+; CHECK-NEXT:    v_writelane_b32 v6, s15, 27
 ; CHECK-NEXT:    v_writelane_b32 v5, s64, 30
-; CHECK-NEXT:    v_writelane_b32 v7, s8, 36
+; CHECK-NEXT:    v_writelane_b32 v6, s16, 28
 ; CHECK-NEXT:    v_writelane_b32 v5, s65, 31
-; CHECK-NEXT:    v_writelane_b32 v7, s9, 37
+; CHECK-NEXT:    v_writelane_b32 v6, s17, 29
 ; CHECK-NEXT:    v_writelane_b32 v5, s66, 32
-; CHECK-NEXT:    s_movk_i32 s28, 0x1f0
+; CHECK-NEXT:    v_writelane_b32 v6, s18, 30
+; CHECK-NEXT:    s_mov_b32 s26, 48
 ; CHECK-NEXT:    s_movk_i32 s30, 0x2f0
-; CHECK-NEXT:    s_mov_b32 s29, s24
+; CHECK-NEXT:    s_mov_b32 s27, s24
 ; CHECK-NEXT:    s_mov_b32 s31, s24
-; CHECK-NEXT:    v_writelane_b32 v7, s10, 38
 ; CHECK-NEXT:    v_writelane_b32 v5, s67, 33
-; CHECK-NEXT:    v_writelane_b32 v7, s11, 39
-; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[28:29], 0x0
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; CHECK-NEXT:    v_writelane_b32 v6, s19, 31
+; CHECK-NEXT:    s_movk_i32 s28, 0x1f0
+; CHECK-NEXT:    s_mov_b32 s29, s24
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[26:27], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[28:29], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[30:31], 0x0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
@@ -125,41 +115,66 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_xor_b64 s[26:27], exec, s[26:27]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_3
 ; CHECK-NEXT:  ; %bb.1: ; %bb48
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v7, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v7, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v7, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v7, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v7, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v7, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v7, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v7, 15
-; CHECK-NEXT:    s_and_b64 vcc, exec, -1
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 4
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v6, s36, 32
+; CHECK-NEXT:    v_writelane_b32 v6, s37, 33
+; CHECK-NEXT:    v_writelane_b32 v6, s38, 34
+; CHECK-NEXT:    v_writelane_b32 v6, s39, 35
+; CHECK-NEXT:    v_writelane_b32 v6, s40, 36
+; CHECK-NEXT:    v_writelane_b32 v6, s41, 37
+; CHECK-NEXT:    v_writelane_b32 v6, s42, 38
+; CHECK-NEXT:    v_writelane_b32 v6, s43, 39
+; CHECK-NEXT:    v_writelane_b32 v6, s44, 40
+; CHECK-NEXT:    v_writelane_b32 v6, s45, 41
+; CHECK-NEXT:    v_writelane_b32 v6, s46, 42
+; CHECK-NEXT:    v_writelane_b32 v6, s47, 43
+; CHECK-NEXT:    v_writelane_b32 v6, s48, 44
+; CHECK-NEXT:    v_writelane_b32 v6, s49, 45
+; CHECK-NEXT:    v_writelane_b32 v6, s50, 46
+; CHECK-NEXT:    v_writelane_b32 v6, s51, 47
+; CHECK-NEXT:    v_readlane_b32 s36, v6, 0
+; CHECK-NEXT:    v_readlane_b32 s44, v6, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s47, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v6, 15
+; CHECK-NEXT:    v_readlane_b32 s37, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v6, 4
+; CHECK-NEXT:    v_readlane_b32 s41, v6, 5
 ; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s42, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v6, 7
+; CHECK-NEXT:    v_readlane_b32 s36, v6, 32
+; CHECK-NEXT:    v_readlane_b32 s37, v6, 33
+; CHECK-NEXT:    v_readlane_b32 s38, v6, 34
+; CHECK-NEXT:    v_readlane_b32 s39, v6, 35
+; CHECK-NEXT:    v_readlane_b32 s40, v6, 36
+; CHECK-NEXT:    v_readlane_b32 s41, v6, 37
+; CHECK-NEXT:    v_readlane_b32 s42, v6, 38
+; CHECK-NEXT:    v_readlane_b32 s43, v6, 39
+; CHECK-NEXT:    v_readlane_b32 s44, v6, 40
+; CHECK-NEXT:    v_readlane_b32 s45, v6, 41
+; CHECK-NEXT:    v_readlane_b32 s46, v6, 42
+; CHECK-NEXT:    v_readlane_b32 s47, v6, 43
+; CHECK-NEXT:    v_readlane_b32 s48, v6, 44
+; CHECK-NEXT:    v_readlane_b32 s49, v6, 45
+; CHECK-NEXT:    v_readlane_b32 s50, v6, 46
+; CHECK-NEXT:    v_readlane_b32 s51, v6, 47
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 7
+; CHECK-NEXT:    s_and_b64 vcc, exec, -1
 ; CHECK-NEXT:  .LBB0_2: ; %bb50
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 32
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 36
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 37
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 38
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 39
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
 ; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 33
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 34
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[60:67], s[40:43] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 35
-; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[44:51], s[8:11] dmask:0x1
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[60:67], s[20:23] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v4
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v0
@@ -167,159 +182,84 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: ; %Flow14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s12, v7, 32
-; CHECK-NEXT:    v_readlane_b32 s13, v7, 33
-; CHECK-NEXT:    v_readlane_b32 s14, v7, 34
-; CHECK-NEXT:    v_readlane_b32 s15, v7, 35
-; CHECK-NEXT:    v_readlane_b32 s16, v7, 36
-; CHECK-NEXT:    v_readlane_b32 s17, v7, 37
-; CHECK-NEXT:    v_readlane_b32 s18, v7, 38
-; CHECK-NEXT:    v_readlane_b32 s19, v7, 39
-; CHECK-NEXT:    v_writelane_b32 v7, s4, 40
-; CHECK-NEXT:    v_writelane_b32 v7, s5, 41
-; CHECK-NEXT:    v_writelane_b32 v7, s6, 42
-; CHECK-NEXT:    v_writelane_b32 v7, s7, 43
-; CHECK-NEXT:    v_writelane_b32 v7, s8, 44
-; CHECK-NEXT:    v_writelane_b32 v7, s9, 45
-; CHECK-NEXT:    v_writelane_b32 v7, s10, 46
-; CHECK-NEXT:    v_writelane_b32 v7, s11, 47
-; CHECK-NEXT:    v_writelane_b32 v7, s12, 48
-; CHECK-NEXT:    v_writelane_b32 v7, s13, 49
-; CHECK-NEXT:    v_writelane_b32 v7, s14, 50
-; CHECK-NEXT:    v_writelane_b32 v7, s15, 51
-; CHECK-NEXT:    v_writelane_b32 v7, s16, 52
-; CHECK-NEXT:    v_writelane_b32 v7, s17, 53
-; CHECK-NEXT:    v_writelane_b32 v7, s18, 54
-; CHECK-NEXT:    v_writelane_b32 v7, s19, 55
-; CHECK-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v7, s52, 56
-; CHECK-NEXT:    v_writelane_b32 v6, s60, 0
-; CHECK-NEXT:    v_writelane_b32 v7, s53, 57
-; CHECK-NEXT:    v_writelane_b32 v6, s61, 1
-; CHECK-NEXT:    v_writelane_b32 v7, s54, 58
-; CHECK-NEXT:    v_writelane_b32 v6, s62, 2
-; CHECK-NEXT:    v_writelane_b32 v7, s55, 59
-; CHECK-NEXT:    v_writelane_b32 v6, s63, 3
-; CHECK-NEXT:    v_writelane_b32 v7, s56, 60
-; CHECK-NEXT:    v_writelane_b32 v6, s64, 4
-; CHECK-NEXT:    v_writelane_b32 v7, s57, 61
-; CHECK-NEXT:    v_writelane_b32 v6, s65, 5
-; CHECK-NEXT:    v_writelane_b32 v7, s58, 62
-; CHECK-NEXT:    v_writelane_b32 v6, s66, 6
-; CHECK-NEXT:    v_writelane_b32 v7, s59, 63
-; CHECK-NEXT:    v_writelane_b32 v6, s67, 7
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[20:21], s[26:27]
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[28:29], s[26:27]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %bb32
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[24:25]
-; CHECK-NEXT:    s_xor_b64 s[22:23], exec, s[8:9]
+; CHECK-NEXT:    s_xor_b64 vcc, exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb43
 ; CHECK-NEXT:    s_mov_b32 s8, 0
+; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s8
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 0
+; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
+; CHECK-NEXT:    v_readlane_b32 s36, v6, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s9
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 4
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 7
-; CHECK-NEXT:    v_readlane_b32 s44, v7, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v7, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v7, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v7, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v7, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v7, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v7, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v7, 15
-; CHECK-NEXT:    image_sample_lz v2, v[0:1], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 16
-; CHECK-NEXT:    v_readlane_b32 s44, v7, 24
-; CHECK-NEXT:    v_readlane_b32 s45, v7, 25
-; CHECK-NEXT:    v_readlane_b32 s46, v7, 26
-; CHECK-NEXT:    v_readlane_b32 s47, v7, 27
-; CHECK-NEXT:    v_readlane_b32 s48, v7, 28
-; CHECK-NEXT:    v_readlane_b32 s49, v7, 29
-; CHECK-NEXT:    v_readlane_b32 s50, v7, 30
-; CHECK-NEXT:    v_readlane_b32 s51, v7, 31
+; CHECK-NEXT:    v_readlane_b32 s37, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v6, 4
+; CHECK-NEXT:    v_readlane_b32 s41, v6, 5
+; CHECK-NEXT:    v_readlane_b32 s42, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v6, 7
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v3
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 17
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 18
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 19
-; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 20
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 21
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 22
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 23
+; CHECK-NEXT:    v_readlane_b32 s44, v6, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v6, 10
+; CHECK-NEXT:    image_sample_lz v2, v[0:1], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    s_mov_b64 s[42:43], s[18:19]
+; CHECK-NEXT:    s_mov_b64 s[40:41], s[16:17]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[14:15]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[12:13]
+; CHECK-NEXT:    v_readlane_b32 s12, v6, 16
+; CHECK-NEXT:    v_readlane_b32 s20, v6, 24
+; CHECK-NEXT:    v_readlane_b32 s21, v6, 25
+; CHECK-NEXT:    v_readlane_b32 s22, v6, 26
+; CHECK-NEXT:    v_readlane_b32 s23, v6, 27
+; CHECK-NEXT:    v_readlane_b32 s24, v6, 28
+; CHECK-NEXT:    v_readlane_b32 s25, v6, 29
+; CHECK-NEXT:    v_readlane_b32 s26, v6, 30
+; CHECK-NEXT:    v_readlane_b32 s27, v6, 31
+; CHECK-NEXT:    v_readlane_b32 s47, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v6, 15
+; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[20:27], s[4:7] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s13, v6, 17
+; CHECK-NEXT:    v_readlane_b32 s14, v6, 18
+; CHECK-NEXT:    v_readlane_b32 s15, v6, 19
+; CHECK-NEXT:    v_readlane_b32 s16, v6, 20
+; CHECK-NEXT:    v_readlane_b32 s17, v6, 21
+; CHECK-NEXT:    v_readlane_b32 s18, v6, 22
+; CHECK-NEXT:    v_readlane_b32 s19, v6, 23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
 ; CHECK-NEXT:    buffer_store_dwordx3 v[2:4], off, s[8:11], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
 ; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:  .LBB0_6: ; %Flow12
-; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[22:23]
-; CHECK-NEXT:    v_readlane_b32 s52, v7, 40
-; CHECK-NEXT:    v_readlane_b32 s53, v7, 41
-; CHECK-NEXT:    v_readlane_b32 s54, v7, 42
-; CHECK-NEXT:    v_readlane_b32 s55, v7, 43
-; CHECK-NEXT:    v_readlane_b32 s56, v7, 44
-; CHECK-NEXT:    v_readlane_b32 s57, v7, 45
-; CHECK-NEXT:    v_readlane_b32 s58, v7, 46
-; CHECK-NEXT:    v_readlane_b32 s59, v7, 47
-; CHECK-NEXT:    v_readlane_b32 s60, v7, 48
-; CHECK-NEXT:    v_readlane_b32 s61, v7, 49
-; CHECK-NEXT:    v_readlane_b32 s62, v7, 50
-; CHECK-NEXT:    v_readlane_b32 s63, v7, 51
-; CHECK-NEXT:    v_readlane_b32 s64, v7, 52
-; CHECK-NEXT:    v_readlane_b32 s65, v7, 53
-; CHECK-NEXT:    v_readlane_b32 s66, v7, 54
-; CHECK-NEXT:    v_readlane_b32 s67, v7, 55
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_9
 ; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s6, s8
 ; CHECK-NEXT:    s_mov_b32 s7, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s6
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 56
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s7
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 57
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 58
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 59
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 60
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 61
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 62
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 63
-; CHECK-NEXT:    s_nop 4
 ; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
 ; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
-; CHECK-NEXT:    ; kill: killed $vgpr1_vgpr2
-; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v6, 0
-; CHECK-NEXT:    v_readlane_b32 s45, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s46, v6, 2
-; CHECK-NEXT:    v_readlane_b32 s47, v6, 3
-; CHECK-NEXT:    v_readlane_b32 s48, v6, 4
-; CHECK-NEXT:    v_readlane_b32 s49, v6, 5
-; CHECK-NEXT:    v_readlane_b32 s50, v6, 6
-; CHECK-NEXT:    v_readlane_b32 s51, v6, 7
-; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
-; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
-; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v1, v0
@@ -333,7 +273,8 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:  .LBB0_9: ; %Flow13
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT:    s_or_b64 exec, exec, s[20:21]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[28:29]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_readlane_b32 s67, v5, 33
 ; CHECK-NEXT:    v_readlane_b32 s66, v5, 32
 ; CHECK-NEXT:    v_readlane_b32 s65, v5, 31
@@ -371,7 +312,6 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 603f457f3e05e4..596d225dd028e7 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1651,6 +1651,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 5
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.3:
+; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1659,8 +1660,6 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
 ; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
 ; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
-; NOOPT-NEXT:    buffer_load_dword v0, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; NOOPT-NEXT:    s_endpgm
 ;
@@ -4211,14 +4210,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 5
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.3:
-; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
-; NOOPT-NEXT:    v_readlane_b32 s0, v31, 0
-; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
-; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
-; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
@@ -4235,22 +4226,26 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
 ; NOOPT-NEXT:    buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(12)
+; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    v_readlane_b32 s0, v31, 0
+; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
+; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
+; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
 ; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
 ; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
 ; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
-; NOOPT-NEXT:    s_waitcnt vmcnt(8)
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v22
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v21
 ; NOOPT-NEXT:    v_mov_b32_e32 v3, v20
 ; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
-; NOOPT-NEXT:    s_waitcnt vmcnt(4)
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
 ; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_mov_b32_e32 v9, v30
 ; NOOPT-NEXT:    v_mov_b32_e32 v10, v29
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v28
@@ -4700,14 +4695,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
 ; NOOPT-NEXT:    v_readlane_b32 s1, v31, 5
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.3:
-; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
-; NOOPT-NEXT:    v_readlane_b32 s0, v31, 0
-; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
-; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
-; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
@@ -4724,22 +4711,26 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
 ; NOOPT-NEXT:    buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(12)
+; NOOPT-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT:    buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    v_readlane_b32 s0, v31, 0
+; NOOPT-NEXT:    v_readlane_b32 s1, v31, 1
+; NOOPT-NEXT:    v_readlane_b32 s2, v31, 2
+; NOOPT-NEXT:    v_readlane_b32 s3, v31, 3
 ; NOOPT-NEXT:    v_mov_b32_e32 v4, v18
 ; NOOPT-NEXT:    v_mov_b32_e32 v5, v17
 ; NOOPT-NEXT:    v_mov_b32_e32 v6, v16
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, v15
-; NOOPT-NEXT:    s_waitcnt vmcnt(8)
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v22
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v21
 ; NOOPT-NEXT:    v_mov_b32_e32 v3, v20
 ; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
-; NOOPT-NEXT:    s_waitcnt vmcnt(4)
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v26
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, v25
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, v24
 ; NOOPT-NEXT:    v_mov_b32_e32 v8, v23
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_mov_b32_e32 v9, v30
 ; NOOPT-NEXT:    v_mov_b32_e32 v10, v29
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v28
@@ -5338,6 +5329,9 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
 ; NOOPT-NEXT:    v_readlane_b32 s1, v18, 27
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.6:
+; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v2, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
 ; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
@@ -5347,10 +5341,6 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
 ; NOOPT-NEXT:    v_readlane_b32 s5, v18, 1
 ; NOOPT-NEXT:    v_readlane_b32 s6, v18, 2
 ; NOOPT-NEXT:    v_readlane_b32 s7, v18, 3
-; NOOPT-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v2, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[4:7], 0
@@ -5366,11 +5356,11 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:    s_cbranch_execz .LBB16_8
 ; NOOPT-NEXT:  ; %bb.7: ; %bb1
+; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[28:29], -1
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[28:29]
-; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s4, v18, 25
 ; NOOPT-NEXT:    s_mov_b32 s7, s1
@@ -5987,9 +5977,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    v_readlane_b32 s1, v32, 6
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.3:
-; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
-; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[28:31], 0 offset:96 ; 4-byte Folded Reload
@@ -6006,10 +5993,13 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    buffer_load_dword v13, off, s[28:31], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v14, off, s[28:31], 0 offset:144 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[28:31], 0 offset:148 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
 ; NOOPT-NEXT:    v_mov_b32_e32 v16, 63
 ; NOOPT-NEXT:    buffer_store_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
-; NOOPT-NEXT:    s_waitcnt vmcnt(14)
+; NOOPT-NEXT:    s_waitcnt vmcnt(1)
 ; NOOPT-NEXT:    v_writelane_b32 v32, s0, 9
 ; NOOPT-NEXT:    v_writelane_b32 v32, s1, 10
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
@@ -6018,22 +6008,18 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill
-; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    buffer_store_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill
-; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    buffer_store_dword v7, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill
-; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    buffer_store_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill
-; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; NOOPT-NEXT:  .LBB17_4: ; =>This Inner Loop Header: Depth=1
@@ -6125,15 +6111,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    v_readlane_b32 s1, v32, 10
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.6:
-; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
-; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
-; NOOPT-NEXT:    v_readlane_b32 s0, v32, 4
-; NOOPT-NEXT:    v_readlane_b32 s4, v32, 0
-; NOOPT-NEXT:    v_readlane_b32 s5, v32, 1
-; NOOPT-NEXT:    v_readlane_b32 s6, v32, 2
-; NOOPT-NEXT:    v_readlane_b32 s7, v32, 3
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload
@@ -6151,22 +6128,27 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    buffer_load_dword v29, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v30, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v31, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(12)
+; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT:    buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    v_readlane_b32 s0, v32, 4
+; NOOPT-NEXT:    v_readlane_b32 s4, v32, 0
+; NOOPT-NEXT:    v_readlane_b32 s5, v32, 1
+; NOOPT-NEXT:    v_readlane_b32 s6, v32, 2
+; NOOPT-NEXT:    v_readlane_b32 s7, v32, 3
 ; NOOPT-NEXT:    v_mov_b32_e32 v5, v19
 ; NOOPT-NEXT:    v_mov_b32_e32 v6, v18
 ; NOOPT-NEXT:    v_mov_b32_e32 v7, v17
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v16
-; NOOPT-NEXT:    s_waitcnt vmcnt(8)
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v23
 ; NOOPT-NEXT:    v_mov_b32_e32 v3, v22
 ; NOOPT-NEXT:    v_mov_b32_e32 v4, v21
 ; NOOPT-NEXT:    v_mov_b32_e32 v8, v20
-; NOOPT-NEXT:    s_waitcnt vmcnt(4)
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, v27
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, v26
 ; NOOPT-NEXT:    v_mov_b32_e32 v15, v25
 ; NOOPT-NEXT:    v_mov_b32_e32 v9, v24
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_mov_b32_e32 v10, v31
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v30
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v29
@@ -7282,11 +7264,11 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; NOOPT-NEXT:    s_mov_b64 exec, s[8:9]
 ; NOOPT-NEXT:    s_cbranch_scc1 .LBB19_3
 ; NOOPT-NEXT:  .LBB19_1: ; %Flow
+; NOOPT-NEXT:    ; implicit-def: $sgpr2
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[8:9]
-; NOOPT-NEXT:    ; implicit-def: $sgpr2
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v4, 0
 ; NOOPT-NEXT:    v_readlane_b32 s1, v4, 1
@@ -7526,11 +7508,11 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
 ; NOOPT-NEXT:    s_cbranch_scc1 .LBB20_3
 ; NOOPT-NEXT:  .LBB20_1: ; %Flow
+; NOOPT-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
-; NOOPT-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v4, 0
 ; NOOPT-NEXT:    v_readlane_b32 s1, v4, 1
@@ -9079,17 +9061,15 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:  .LBB26_1: ; %bb2
 ; NOOPT-NEXT:    ; =>This Loop Header: Depth=1
 ; NOOPT-NEXT:    ; Child Loop BB26_3 Depth 2
+; NOOPT-NEXT:    s_waitcnt expcnt(0)
+; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; NOOPT-NEXT:    s_waitcnt expcnt(1)
 ; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s2, v18, 0
-; NOOPT-NEXT:    s_waitcnt expcnt(0)
-; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 s[0:1], -1
 ; NOOPT-NEXT:    ; implicit-def: $sgpr4
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_cmp_ge_i32_e64 s[2:3], v0, s2
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, s4
 ; NOOPT-NEXT:    s_and_b64 vcc, exec, s[2:3]
@@ -9102,11 +9082,11 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    s_cbranch_vccnz .LBB26_6
 ; NOOPT-NEXT:  ; %bb.2: ; %bb4
 ; NOOPT-NEXT:    ; in Loop: Header=BB26_1 Depth=1
+; NOOPT-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
-; NOOPT-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v18, 1
 ; NOOPT-NEXT:    ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3
@@ -9254,9 +9234,6 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    v_readlane_b32 s1, v18, 5
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.5: ; in Loop: Header=BB26_1 Depth=1
-; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:84 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:88 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:92 ; 4-byte Folded Reload
@@ -9273,9 +9250,13 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    buffer_load_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
 ; NOOPT-NEXT:    s_mov_b64 s[0:1], 0
 ; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT:    s_waitcnt vmcnt(1)
 ; NOOPT-NEXT:    v_writelane_b32 v18, s0, 2
 ; NOOPT-NEXT:    v_writelane_b32 v18, s1, 3
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
@@ -9283,6 +9264,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
 ; NOOPT-NEXT:  .LBB26_6: ; %Flow
 ; NOOPT-NEXT:    ; in Loop: Header=BB26_1 Depth=1
+; NOOPT-NEXT:    s_waitcnt expcnt(1)
+; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -9290,13 +9273,11 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v18, 2
 ; NOOPT-NEXT:    v_readlane_b32 s1, v18, 3
-; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; NOOPT-NEXT:    s_mov_b32 s0, 1
 ; NOOPT-NEXT:    ; implicit-def: $sgpr1
 ; NOOPT-NEXT:    v_cmp_ne_u32_e64 s[0:1], v1, s0
 ; NOOPT-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_cbranch_vccnz .LBB26_1
 ; NOOPT-NEXT:  ; %bb.7: ; %bb8
@@ -9744,14 +9725,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    v_readlane_b32 s1, v33, 8
 ; NOOPT-NEXT:    s_mov_b64 exec, s[0:1]
 ; NOOPT-NEXT:  ; %bb.3:
-; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
-; NOOPT-NEXT:    buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
-; NOOPT-NEXT:    v_readlane_b32 s0, v33, 3
-; NOOPT-NEXT:    v_readlane_b32 s1, v33, 4
-; NOOPT-NEXT:    v_readlane_b32 s2, v33, 5
-; NOOPT-NEXT:    v_readlane_b32 s3, v33, 6
 ; NOOPT-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload
@@ -9770,22 +9743,26 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; NOOPT-NEXT:    buffer_load_dword v30, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v31, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v32, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload
-; NOOPT-NEXT:    s_waitcnt vmcnt(12)
+; NOOPT-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT:    buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT:    s_waitcnt vmcnt(0)
+; NOOPT-NEXT:    v_readlane_b32 s0, v33, 3
+; NOOPT-NEXT:    v_readlane_b32 s1, v33, 4
+; NOOPT-NEXT:    v_readlane_b32 s2, v33, 5
+; NOOPT-NEXT:    v_readlane_b32 s3, v33, 6
 ; NOOPT-NEXT:    v_mov_b32_e32 v6, v20
 ; NOOPT-NEXT:    v_mov_b32_e32 v7, v19
 ; NOOPT-NEXT:    v_mov_b32_e32 v8, v18
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, v17
-; NOOPT-NEXT:    s_waitcnt vmcnt(8)
 ; NOOPT-NEXT:    v_mov_b32_e32 v1, v24
 ; NOOPT-NEXT:    v_mov_b32_e32 v2, v23
 ; NOOPT-NEXT:    v_mov_b32_e32 v3, v22
 ; NOOPT-NEXT:    v_mov_b32_e32 v9, v21
-; NOOPT-NEXT:    s_waitcnt vmcnt(4)
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, v28
 ; NOOPT-NEXT:    v_mov_b32_e32 v15, v27
 ; NOOPT-NEXT:    v_mov_b32_e32 v16, v26
 ; NOOPT-NEXT:    v_mov_b32_e32 v10, v25
-; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_mov_b32_e32 v11, v32
 ; NOOPT-NEXT:    v_mov_b32_e32 v12, v31
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, v30
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index e44803d611f84b..b37469a7215f8e 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -171,23 +171,23 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -197,6 +197,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v7, s8, 3
 ; W64-O0-NEXT:    v_writelane_b32 v7, s9, 4
 ; W64-O0-NEXT:    v_writelane_b32 v7, s10, 5
@@ -208,6 +209,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -219,9 +221,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    v_readlane_b32 s10, v7, 5
 ; W64-O0-NEXT:    v_readlane_b32 s11, v7, 6
 ; W64-O0-NEXT:    v_readlane_b32 s6, v7, 0
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
@@ -563,23 +563,23 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -589,6 +589,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v17, s8, 3
 ; W64-O0-NEXT:    v_writelane_b32 v17, s9, 4
 ; W64-O0-NEXT:    v_writelane_b32 v17, s10, 5
@@ -600,6 +601,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB1_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -611,9 +613,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    v_readlane_b32 s10, v17, 5
 ; W64-O0-NEXT:    v_readlane_b32 s11, v17, 6
 ; W64-O0-NEXT:    v_readlane_b32 s6, v17, 0
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
@@ -634,23 +634,23 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB1_4: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -660,6 +660,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v17, s8, 11
 ; W64-O0-NEXT:    v_writelane_b32 v17, s9, 12
 ; W64-O0-NEXT:    v_writelane_b32 v17, s10, 13
@@ -671,6 +672,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.5: ; in Loop: Header=BB1_4 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -682,9 +684,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    v_readlane_b32 s10, v17, 13
 ; W64-O0-NEXT:    v_readlane_b32 s11, v17, 14
 ; W64-O0-NEXT:    v_readlane_b32 s6, v17, 0
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -1085,23 +1085,23 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -1111,6 +1111,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v13, s8, 4
 ; W64-O0-NEXT:    v_writelane_b32 v13, s9, 5
 ; W64-O0-NEXT:    v_writelane_b32 v13, s10, 6
@@ -1122,6 +1123,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB2_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1133,9 +1135,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_readlane_b32 s10, v13, 6
 ; W64-O0-NEXT:    v_readlane_b32 s11, v13, 7
 ; W64-O0-NEXT:    v_readlane_b32 s6, v13, 1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -1149,9 +1149,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_readlane_b32 s6, v13, 2
 ; W64-O0-NEXT:    v_readlane_b32 s7, v13, 3
 ; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
 ; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_and_b32_e64 v1, v1, s5
@@ -1184,23 +1184,23 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB2_5: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -1210,6 +1210,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v13, s8, 15
 ; W64-O0-NEXT:    v_writelane_b32 v13, s9, 16
 ; W64-O0-NEXT:    v_writelane_b32 v13, s10, 17
@@ -1221,6 +1222,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.6: ; in Loop: Header=BB2_5 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1232,9 +1234,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_readlane_b32 s10, v13, 17
 ; W64-O0-NEXT:    v_readlane_b32 s11, v13, 18
 ; W64-O0-NEXT:    v_readlane_b32 s6, v13, 12
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 896cb6042e810b..8ed8a0ad70837b 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -182,23 +182,23 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -208,6 +208,7 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v7, s8, 3
 ; W64-O0-NEXT:    v_writelane_b32 v7, s9, 4
 ; W64-O0-NEXT:    v_writelane_b32 v7, s10, 5
@@ -219,6 +220,7 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -230,9 +232,7 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0-NEXT:    v_readlane_b32 s10, v7, 5
 ; W64-O0-NEXT:    v_readlane_b32 s11, v7, 6
 ; W64-O0-NEXT:    v_readlane_b32 s6, v7, 0
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
@@ -598,23 +598,23 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -624,6 +624,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v17, s8, 3
 ; W64-O0-NEXT:    v_writelane_b32 v17, s9, 4
 ; W64-O0-NEXT:    v_writelane_b32 v17, s10, 5
@@ -635,6 +636,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB1_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -646,9 +648,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    v_readlane_b32 s10, v17, 5
 ; W64-O0-NEXT:    v_readlane_b32 s11, v17, 6
 ; W64-O0-NEXT:    v_readlane_b32 s6, v17, 0
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -669,23 +669,23 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB1_4: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -695,6 +695,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v17, s8, 11
 ; W64-O0-NEXT:    v_writelane_b32 v17, s9, 12
 ; W64-O0-NEXT:    v_writelane_b32 v17, s10, 13
@@ -706,6 +707,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.5: ; in Loop: Header=BB1_4 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -717,9 +719,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    v_readlane_b32 s10, v17, 13
 ; W64-O0-NEXT:    v_readlane_b32 s11, v17, 14
 ; W64-O0-NEXT:    v_readlane_b32 s6, v17, 0
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
@@ -1136,23 +1136,23 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -1162,6 +1162,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v13, s8, 4
 ; W64-O0-NEXT:    v_writelane_b32 v13, s9, 5
 ; W64-O0-NEXT:    v_writelane_b32 v13, s10, 6
@@ -1173,6 +1174,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB2_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1184,9 +1186,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    v_readlane_b32 s10, v13, 6
 ; W64-O0-NEXT:    v_readlane_b32 s11, v13, 7
 ; W64-O0-NEXT:    v_readlane_b32 s6, v13, 1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -1200,9 +1200,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    v_readlane_b32 s6, v13, 2
 ; W64-O0-NEXT:    v_readlane_b32 s7, v13, 3
 ; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
 ; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_and_b32_e64 v1, v1, s5
@@ -1218,16 +1218,15 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execz .LBB2_8
 ; W64-O0-NEXT:  ; %bb.4: ; %bb1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    v_readlane_b32 s4, v13, 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 0
 ; W64-O0-NEXT:    v_mov_b32_e32 v6, v4
 ; W64-O0-NEXT:    v_mov_b32_e32 v0, v3
 ; W64-O0-NEXT:    v_mov_b32_e32 v4, v2
@@ -1256,23 +1255,23 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  .LBB2_5: ; =>This Inner Loop Header: Depth=1
-; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
-; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(4)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
 ; W64-O0-NEXT:    s_mov_b32 s4, s8
 ; W64-O0-NEXT:    s_mov_b32 s5, s12
 ; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(1)
 ; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
@@ -1282,6 +1281,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    s_mov_b32 s9, s12
 ; W64-O0-NEXT:    s_mov_b32 s10, s7
 ; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_writelane_b32 v13, s8, 15
 ; W64-O0-NEXT:    v_writelane_b32 v13, s9, 16
 ; W64-O0-NEXT:    v_writelane_b32 v13, s10, 17
@@ -1293,6 +1293,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.6: ; in Loop: Header=BB2_5 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1304,9 +1305,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    v_readlane_b32 s10, v13, 17
 ; W64-O0-NEXT:    v_readlane_b32 s11, v13, 18
 ; W64-O0-NEXT:    v_readlane_b32 s6, v13, 12
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
-; W64-O0-NEXT:    s_nop 2
+; W64-O0-NEXT:    s_nop 4
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index 078b133a93d6f3..b8d35b06519a31 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -223,12 +223,15 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v2, off, s[92:95], 0 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s8, v0, 56
 ; GCN-NEXT:    v_readlane_b32 s9, v0, 57
 ; GCN-NEXT:    v_readlane_b32 s10, v0, 58
@@ -293,7 +296,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    v_readlane_b32 s73, v0, 5
 ; GCN-NEXT:    v_readlane_b32 s74, v0, 6
 ; GCN-NEXT:    v_readlane_b32 s75, v0, 7
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s76, v1, 56
 ; GCN-NEXT:    v_readlane_b32 s77, v1, 57
 ; GCN-NEXT:    v_readlane_b32 s78, v1, 58
@@ -318,9 +320,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    v_readlane_b32 s5, v1, 5
 ; GCN-NEXT:    v_readlane_b32 s6, v1, 6
 ; GCN-NEXT:    v_readlane_b32 s7, v1, 7
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v2, off, s[92:95], 0 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
@@ -379,7 +378,6 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s0, v2, 0
 ; GCN-NEXT:    v_readlane_b32 s1, v2, 1
 ; GCN-NEXT:    v_readlane_b32 s2, v2, 2
@@ -585,12 +583,12 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[28:29]
 ; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s16, v0, 8
 ; GCN-NEXT:    v_readlane_b32 s17, v0, 9
 ; GCN-NEXT:    v_readlane_b32 s20, v0, 0
@@ -601,7 +599,6 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    v_readlane_b32 s25, v0, 5
 ; GCN-NEXT:    v_readlane_b32 s26, v0, 6
 ; GCN-NEXT:    v_readlane_b32 s27, v0, 7
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s36, v1, 32
 ; GCN-NEXT:    v_readlane_b32 s37, v1, 33
 ; GCN-NEXT:    v_readlane_b32 s38, v1, 34
@@ -829,6 +826,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GCN-NEXT:    buffer_load_dword v32, off, s[52:55], 0 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_load_dword v31, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -880,9 +880,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    v_readlane_b32 s29, v31, 13
 ; GCN-NEXT:    v_readlane_b32 s30, v31, 14
 ; GCN-NEXT:    v_readlane_b32 s31, v31, 15
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v32, off, s[52:55], 0 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[16:31]
 ; GCN-NEXT:    ;;#ASMEND
@@ -905,7 +902,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    v_readlane_b32 s17, v31, 61
 ; GCN-NEXT:    v_readlane_b32 s18, v31, 62
 ; GCN-NEXT:    v_readlane_b32 s19, v31, 63
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s0, v32, 0
 ; GCN-NEXT:    v_readlane_b32 s1, v32, 1
 ; GCN-NEXT:    ;;#ASMSTART
@@ -1065,6 +1061,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GCN-NEXT:    buffer_load_dword v32, off, s[52:55], 0 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_load_dword v31, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -1116,9 +1115,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    v_readlane_b32 s29, v31, 13
 ; GCN-NEXT:    v_readlane_b32 s30, v31, 14
 ; GCN-NEXT:    v_readlane_b32 s31, v31, 15
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v32, off, s[52:55], 0 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def v0
 ; GCN-NEXT:    ;;#ASMEND
@@ -1144,7 +1140,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    v_readlane_b32 s17, v31, 61
 ; GCN-NEXT:    v_readlane_b32 s18, v31, 62
 ; GCN-NEXT:    v_readlane_b32 s19, v31, 63
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s0, v32, 0
 ; GCN-NEXT:    v_readlane_b32 s1, v32, 1
 ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/postra-sink-update-dependency.mir b/llvm/test/CodeGen/AMDGPU/postra-sink-update-dependency.mir
index 14617e066f954c..31221064e25c42 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-sink-update-dependency.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-sink-update-dependency.mir
@@ -14,8 +14,6 @@ body:             |
   ; CHECK-NEXT:   liveins: $sgpr0, $sgpr3, $sgpr2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $vgpr1 = IMPLICIT_DEF
-  ; CHECK-NEXT:   renamable $sgpr4 = COPY $sgpr2
-  ; CHECK-NEXT:   renamable $sgpr2 = COPY $sgpr3
   ; CHECK-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr0, 0, $vgpr1
   ; CHECK-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
@@ -26,8 +24,10 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr0, $sgpr2, $sgpr4, $vgpr1
+  ; CHECK-NEXT:   liveins: $sgpr0, $sgpr2, $sgpr3, $vgpr1
   ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr4 = COPY $sgpr2
+  ; CHECK-NEXT:   renamable $sgpr2 = COPY $sgpr3
   ; CHECK-NEXT:   $sgpr3 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
   ; CHECK-NEXT:   renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
   ; CHECK-NEXT:   renamable $sgpr5 = COPY $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6583d5e8aa5a07..21ff7f8cf207c3 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -713,12 +713,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 11
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
@@ -743,8 +737,13 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 11
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
@@ -771,7 +770,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
 ; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
@@ -781,7 +779,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
 ; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
@@ -793,12 +790,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -938,9 +933,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB0_6
 ; GFX9-O0-NEXT:    s_branch .LBB0_1
 ; GFX9-O0-NEXT:  .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
@@ -957,9 +949,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[20:21]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s6, 64
@@ -1040,6 +1035,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 10
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 11
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
@@ -1071,9 +1067,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -1082,9 +1075,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
@@ -1095,7 +1091,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -1199,6 +1195,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 8
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
@@ -2095,12 +2092,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB1_4
 ; GFX9-O0-NEXT:  .LBB1_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 9
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
@@ -2125,8 +2116,13 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 9
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
@@ -2153,7 +2149,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
 ; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
@@ -2163,7 +2158,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
 ; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
@@ -2175,12 +2169,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -2320,9 +2312,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB1_6
 ; GFX9-O0-NEXT:    s_branch .LBB1_1
 ; GFX9-O0-NEXT:  .LBB1_7: ; %udiv-preheader
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
@@ -2339,9 +2328,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[20:21]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s6, 64
@@ -2422,6 +2414,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 8
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
@@ -2453,9 +2446,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_6
 ; GFX9-O0-NEXT:  .LBB1_8: ; %udiv-bb1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -2464,9 +2454,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
@@ -2477,7 +2470,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
@@ -2581,6 +2574,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 6
 ; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 7
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 59036c64c8afcc..cda149012b1a7d 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -113,6 +113,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], 0 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[24:25]
+; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
 ; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[24:25]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -132,9 +135,6 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT:    v_readlane_b32 s17, v23, 13
 ; GCN-NEXT:    v_readlane_b32 s18, v23, 14
 ; GCN-NEXT:    v_readlane_b32 s19, v23, 15
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], 0 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
@@ -192,7 +192,6 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT:    v_readlane_b32 s21, v23, 61
 ; GCN-NEXT:    v_readlane_b32 s22, v23, 62
 ; GCN-NEXT:    v_readlane_b32 s23, v23, 63
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s4, v22, 0
 ; GCN-NEXT:    v_readlane_b32 s5, v22, 1
 ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 76ecbc0863650b..a6f127e7aba8d7 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -9750,36 +9750,23 @@ entry:
 define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX6-LABEL: test_limited_sgpr:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
-; GFX6-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
-; GFX6-NEXT:    s_mov_b32 s42, -1
-; GFX6-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX6-NEXT:    s_add_u32 s40, s40, s9
 ; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
-; GFX6-NEXT:    s_addc_u32 s41, s41, 0
 ; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, -1, v0
-; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v1, s0, 0
-; GFX6-NEXT:    v_writelane_b32 v1, s1, 1
-; GFX6-NEXT:    v_writelane_b32 v1, s2, 2
-; GFX6-NEXT:    v_writelane_b32 v1, s3, 3
-; GFX6-NEXT:    s_mov_b32 s8, 0x80400
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s8 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 8, v0
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
-; GFX6-NEXT:    s_mov_b32 s2, 0x86a00
+; GFX6-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX6-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX6-NEXT:    s_mov_b32 s42, -1
+; GFX6-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX6-NEXT:    s_add_u32 s40, s40, s9
+; GFX6-NEXT:    s_addc_u32 s41, s41, 0
+; GFX6-NEXT:    s_mov_b32 s2, 0x85e00
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -9788,7 +9775,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT:    s_mov_b32 s2, 0x86600
+; GFX6-NEXT:    s_mov_b32 s2, 0x85a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9796,7 +9783,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT:    s_mov_b32 s2, 0x86200
+; GFX6-NEXT:    s_mov_b32 s2, 0x85600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9804,7 +9791,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT:    s_mov_b32 s2, 0x85e00
+; GFX6-NEXT:    s_mov_b32 s2, 0x85200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9812,7 +9799,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT:    s_mov_b32 s2, 0x85a00
+; GFX6-NEXT:    s_mov_b32 s2, 0x84e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9820,7 +9807,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT:    s_mov_b32 s2, 0x85600
+; GFX6-NEXT:    s_mov_b32 s2, 0x84a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9828,7 +9815,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT:    s_mov_b32 s2, 0x85200
+; GFX6-NEXT:    s_mov_b32 s2, 0x84600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9836,7 +9823,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT:    s_mov_b32 s2, 0x84e00
+; GFX6-NEXT:    s_mov_b32 s2, 0x84200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9844,7 +9831,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT:    s_mov_b32 s2, 0x84a00
+; GFX6-NEXT:    s_mov_b32 s2, 0x83e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9852,7 +9839,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT:    s_mov_b32 s2, 0x84600
+; GFX6-NEXT:    s_mov_b32 s2, 0x83a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
@@ -9860,20 +9847,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT:    s_mov_b32 s2, 0x84200
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64
-; GFX6-NEXT:    s_mov_b32 s2, 0x83a00
+; GFX6-NEXT:    s_mov_b32 s2, 0x83600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_load_dwordx4 v[16:19], v[5:6], s[4:7], 0 addr64 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
 ; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
@@ -9883,14 +9863,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32
-; GFX6-NEXT:    s_mov_b32 s2, 0x83600
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
@@ -9899,24 +9872,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
 ; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
 ; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
-; GFX6-NEXT:    s_mov_b32 s10, 0x80800
+; GFX6-NEXT:    s_mov_b32 s10, 0x80400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT:    s_mov_b32 s0, 0x83e00
+; GFX6-NEXT:    buffer_load_dwordx4 v[20:23], v[5:6], s[4:7], 0 addr64 offset:48
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(3)
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 1
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX6-NEXT:    buffer_store_dword v7, v4, s[40:43], 0 offen
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s[4:11]
@@ -9932,12 +9898,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
-; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s12, 0x80800
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s[8:15]
@@ -9949,19 +9915,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    ; def s[24:31]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
-; GFX6-NEXT:    ; def s[0:3]
+; GFX6-NEXT:    ; def s[4:7]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
-; GFX6-NEXT:    ; def s[4:5]
+; GFX6-NEXT:    ; def s[2:3]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s33
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX6-NEXT:    s_mov_b64 vcc, s[6:7]
+; GFX6-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX6-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX6-NEXT:  ; %bb.1: ; %bb0
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -9973,18 +9938,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
-; GFX6-NEXT:    s_mov_b32 s34, 0x81400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s38, 0x81000
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x80c00
+; GFX6-NEXT:    s_mov_b32 s38, 0x80800
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s8, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s9, v4, 1
@@ -9996,8 +9961,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s15, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10009,18 +9974,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
-; GFX6-NEXT:    s_mov_b32 s34, 0x81c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s38, 0x81800
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x81400
+; GFX6-NEXT:    s_mov_b32 s38, 0x81000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s16, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s17, v4, 1
@@ -10032,8 +9997,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s23, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10045,18 +10010,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
-; GFX6-NEXT:    s_mov_b32 s34, 0x82400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s38, 0x82000
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s38, 0x81800
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s24, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s25, v4, 1
@@ -10068,8 +10033,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s31, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10077,30 +10042,44 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
 ; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
 ; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
-; GFX6-NEXT:    s_mov_b32 s34, 0x82c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s38, 0x82800
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
-; GFX6-NEXT:    s_mov_b64 exec, 3
+; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
 ; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
-; GFX6-NEXT:    s_mov_b32 s2, 0x83000
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s6, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s7, 3
+; GFX6-NEXT:    s_mov_b32 s36, 0x82c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[34:35], exec
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, 3
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_writelane_b32 v4, s2, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s3, 1
+; GFX6-NEXT:    s_mov_b32 s4, 0x83000
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s36, 0x82400
+; GFX6-NEXT:    s_mov_b32 s38, 0x82000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
@@ -10112,13 +10091,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s7, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX6-NEXT:    s_mov_b64 s[34:35], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s44, 0x82c00
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x20b0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s36, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s37, v4, 1
@@ -10126,7 +10105,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s39, v4, 3
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
+; GFX6-NEXT:    s_mov_b64 vcc, s[34:35]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
@@ -10142,8 +10122,64 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35]
 ; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    s_mov_b64 s[34:35], vcc
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
+; GFX6-NEXT:    s_mov_b64 exec, 15
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s6, 0x82800
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b32 s2, 0x86200
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s2, 0x86600
+; GFX6-NEXT:    buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s2, 0x86a00
+; GFX6-NEXT:    buffer_store_dword v16, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_mov_b32_e32 v0, v20
+; GFX6-NEXT:    v_mov_b32_e32 v1, v21
+; GFX6-NEXT:    v_mov_b32_e32 v2, v22
+; GFX6-NEXT:    v_mov_b32_e32 v3, v23
+; GFX6-NEXT:    s_waitcnt expcnt(3)
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    buffer_load_dword v16, off, s[40:43], s2 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_waitcnt expcnt(2)
+; GFX6-NEXT:    buffer_load_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_waitcnt expcnt(1)
+; GFX6-NEXT:    buffer_load_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s2, 0x86600
+; GFX6-NEXT:    v_mov_b32_e32 v23, v3
+; GFX6-NEXT:    buffer_load_dword v12, off, s[40:43], s2 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s2, 0x86200
+; GFX6-NEXT:    v_mov_b32_e32 v22, v2
+; GFX6-NEXT:    v_mov_b32_e32 v21, v1
+; GFX6-NEXT:    v_mov_b32_e32 v20, v0
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
@@ -10157,157 +10193,121 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:  .LBB1_2: ; %ret
-; GFX6-NEXT:    s_or_b64 exec, exec, vcc
-; GFX6-NEXT:    s_mov_b64 s[4:5], exec
+; GFX6-NEXT:    s_or_b64 exec, exec, s[34:35]
+; GFX6-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s6, 0x80400
+; GFX6-NEXT:    s_mov_b32 s8, 0x80400
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s4, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s5, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s6, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s7, v4, 3
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX6-NEXT:    s_mov_b64 s[36:37], s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[4:5], exec
-; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s6, 0x80800
+; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX6-NEXT:    s_mov_b32 s4, 0x85e00
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT:    s_mov_b32 s4, 0x85a00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x85600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:224
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x85200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX6-NEXT:    s_mov_b32 s0, 0x86a00
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b64 s[38:39], s[2:3]
-; GFX6-NEXT:    s_mov_b32 s0, 0x86600
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x86200
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x85e00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x85a00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x85600
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x85200
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x84e00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x84a00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x84600
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x84200
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83a00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83e00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83600
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:208
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x84e00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:192
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x84a00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:176
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x84600
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:160
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x84200
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:144
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x83e00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:128
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x83a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83200
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x83600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x83200
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:80
+; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], v[5:6], s[0:3], 0 addr64 offset:64
+; GFX6-NEXT:    buffer_store_dwordx4 v[20:23], v[5:6], s[0:3], 0 addr64 offset:48
+; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], v[5:6], s[0:3], 0 addr64 offset:32
+; GFX6-NEXT:    s_waitcnt expcnt(3)
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:16
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-FLATSCR-LABEL: test_limited_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 0daa6860072616..4dfd4c095c87a0 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -367,15 +367,14 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
 ; HSA-TRAP-GFX1100-O0-NEXT:    scratch_store_b32 off, v0, off offset:4 ; 4-byte Folded Spill
 ; HSA-TRAP-GFX1100-O0-NEXT:    s_cbranch_execnz .LBB2_2
 ; HSA-TRAP-GFX1100-O0-NEXT:  ; %bb.1:
+; HSA-TRAP-GFX1100-O0-NEXT:    scratch_load_b32 v0, off, off offset:8 ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT:    scratch_load_b32 v1, off, off offset:4 ; 4-byte Folded Reload
 ; HSA-TRAP-GFX1100-O0-NEXT:    s_or_saveexec_b32 s6, -1
 ; HSA-TRAP-GFX1100-O0-NEXT:    scratch_load_b32 v2, off, off ; 4-byte Folded Reload
 ; HSA-TRAP-GFX1100-O0-NEXT:    s_mov_b32 exec_lo, s6
 ; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-TRAP-GFX1100-O0-NEXT:    v_readlane_b32 s0, v2, 0
 ; HSA-TRAP-GFX1100-O0-NEXT:    v_readlane_b32 s1, v2, 1
-; HSA-TRAP-GFX1100-O0-NEXT:    scratch_load_b32 v0, off, off offset:8 ; 4-byte Folded Reload
-; HSA-TRAP-GFX1100-O0-NEXT:    scratch_load_b32 v1, off, off offset:4 ; 4-byte Folded Reload
-; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-TRAP-GFX1100-O0-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; HSA-TRAP-GFX1100-O0-NEXT:    s_waitcnt_vscnt null, 0x0
 ; HSA-TRAP-GFX1100-O0-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 1089093ea691c3..5a3dc017354860 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -213,12 +213,12 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    v_readlane_b32 s36, v5, 4
 ; GFX9-O0-NEXT:    v_readlane_b32 s37, v5, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[36:37]
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_readlane_b32 s38, v5, 0
 ; GFX9-O0-NEXT:    v_readlane_b32 s39, v5, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s34, v5, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s35, v5, 3
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[36:37], v0, v3
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[36:37]
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 027081752a11bb..c2d15ee9e288b2 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -190,12 +190,12 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v5, 5
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v5, 6
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_readlane_b32 s2, v5, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s3, v5, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s0, v5, 3
 ; GFX9-O0-NEXT:    v_readlane_b32 s1, v5, 4
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v3
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -1042,12 +1042,12 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v5, 5
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v5, 6
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_readlane_b32 s2, v5, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s3, v5, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s0, v5, 3
 ; GFX9-O0-NEXT:    v_readlane_b32 s1, v5, 4
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v3
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]



More information about the llvm-commits mailing list