[llvm] [AMDGPU] Improve isBasicBlockPrologue helper function (PR #69924)

Christudasan Devadasan via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 27 05:28:41 PDT 2023


https://github.com/cdevadas updated https://github.com/llvm/llvm-project/pull/69924

>From ffba940347b4daa811997e00224ba42a497fa8ea Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Mon, 23 Oct 2023 14:03:03 +0530
Subject: [PATCH] [AMDGPU] Try to fix the block prologs broken by RA inserted
 instructions

The insertion point determined by RA when inserting spills and live-range
splits at the beginning of a block sometimes goes wrong, and the newly
inserted vector instructions end up placed before the exec-mask restore
instruction, which is incorrect. This happens mainly because the decision
depends on isBasicBlockPrologue, which does not account for instructions
inserted earlier during RA (spills and splits), and so the block prolog
gets broken.

A better approach for deciding the insertion point should be worked out.
For now, improve the helper function to consider all possible early
insertions. This patch handles the spill instructions; the copies
associated with live-range splits should also be included in the block
prolog.
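
In short, isBasicBlockPrologue() now also treats RA-inserted spill and
restore instructions as part of the block prolog, in addition to the
non-terminator, non-COPY instructions that modify exec. A condensed
sketch of the check this patch adds (the full change is in
SIInstrInfo.cpp below):

  // An instruction belongs to the block prolog if it is a spill/restore
  // opcode inserted early by RA, or a non-terminator, non-COPY
  // instruction that modifies the exec mask.
  bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
    uint16_t Opc = MI.getOpcode();
    return isSpillOpcode(Opc) ||
           (!MI.isTerminator() && Opc != AMDGPU::COPY &&
            MI.modifiesRegister(AMDGPU::EXEC, &RI));
  }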
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  12 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   5 +
 .../GlobalISel/image-waterfall-loop-O0.ll     |   5 +-
 .../AMDGPU/bb-prolog-spill-during-regalloc.ll |   7 +-
 llvm/test/CodeGen/AMDGPU/collapse-endcf.ll    |  71 +--
 .../AMDGPU/control-flow-fastregalloc.ll       |  18 +-
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |   2 +-
 .../identical-subrange-spill-infloop.ll       | 375 ++++++++------
 ...uf-legalize-operands-non-ptr-intrinsics.ll |  54 +-
 .../CodeGen/AMDGPU/mubuf-legalize-operands.ll |  58 ++-
 .../AMDGPU/partial-sgpr-to-vgpr-spills.ll     |  33 +-
 .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll     |   7 +-
 .../CodeGen/AMDGPU/spill-scavenge-offset.ll   | 489 +++++++++---------
 llvm/test/CodeGen/AMDGPU/swdev380865.ll       |  16 +-
 .../AMDGPU/vgpr-spill-placement-issue61083.ll |   3 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |   6 +-
 16 files changed, 632 insertions(+), 529 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 25e057b639b36ee..fae43bf30a3f6ba 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8439,8 +8439,16 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
 }
 
 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
-  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
-         MI.modifiesRegister(AMDGPU::EXEC, &RI);
+  // We need to handle instructions which may be inserted during register
+  // allocation to handle the prolog. The initial prolog instruction may have
+  // been separated from the start of the block by spills and copies
+  // inserted as needed by the prolog.
+  uint16_t Opc = MI.getOpcode();
+
+  // FIXME: Copies inserted in the block prolog for live-range split should also
+  // be included.
+  return (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
+                                 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
 }
 
 MachineInstrBuilder
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e6c64d909d3eead..1fd72e6376e2d65 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -675,6 +675,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
   }
 
+  bool isSpillOpcode(uint16_t Opcode) const {
+    return get(Opcode).TSFlags &
+           (SIInstrFlags::SGPRSpill | SIInstrFlags::VGPRSpill);
+  }
+
   static bool isWWMRegSpillOpcode(uint16_t Opcode) {
     return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE ||
            Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE ||
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index b19230c2e876c4f..10cbc56cc5fbea1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -144,8 +144,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s21
 ; CHECK-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s21
@@ -163,6 +161,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    v_readlane_b32 s17, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s18, v2, 2
 ; CHECK-NEXT:    v_readlane_b32 s19, v2, 3
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
index 7df42fc506c790d..e22cb912552f970 100644
--- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
@@ -1,8 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs --stop-after=regallocfast,1 -o - %s | FileCheck -check-prefix=REGALLOC %s
 
-; FIXME: There are two spill codes inserted wrongly in this test.
-; They are inserted during regalloc for the BBLiveIns - the spill restores for vgpr1 in the Flow block (bb.1) and for vgpr0 in the return block (bb.4).
+; Test to check if the bb prolog spills are inserted correctly during regalloc.
 define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
   ; REGALLOC-LABEL: name: prolog_spill
   ; REGALLOC: bb.0.bb.0:
@@ -33,10 +32,10 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
   ; REGALLOC-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; REGALLOC-NEXT: {{  $}}
   ; REGALLOC-NEXT:   $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
-  ; REGALLOC-NEXT:   $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
   ; REGALLOC-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr4_sgpr5
   ; REGALLOC-NEXT:   $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
   ; REGALLOC-NEXT:   renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec
+  ; REGALLOC-NEXT:   $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
   ; REGALLOC-NEXT:   SI_SPILL_V32_SAVE killed $vgpr1, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
   ; REGALLOC-NEXT:   renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; REGALLOC-NEXT:   renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
@@ -66,10 +65,10 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
   ; REGALLOC-NEXT: {{  $}}
   ; REGALLOC-NEXT: bb.4.bb.3:
   ; REGALLOC-NEXT:   $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
-  ; REGALLOC-NEXT:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
   ; REGALLOC-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5
   ; REGALLOC-NEXT:   $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3
   ; REGALLOC-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
+  ; REGALLOC-NEXT:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
   ; REGALLOC-NEXT:   renamable $sgpr4 = S_MOV_B32 5
   ; REGALLOC-NEXT:   renamable $vgpr0 = V_MUL_LO_U32_e64 killed $vgpr0, killed $sgpr4, implicit $exec
   ; REGALLOC-NEXT:   KILL killed renamable $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 73d5088141cdb31..dc519df52919add 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -75,10 +75,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v0, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -104,7 +104,6 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB0_3
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb.inner.then
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
@@ -112,7 +111,9 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -248,10 +249,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v0, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s0, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -277,7 +278,6 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB1_4
 ; GCN-O0-NEXT:  ; %bb.2: ; %bb.inner.then
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
@@ -285,7 +285,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -311,7 +313,6 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:    s_branch .LBB1_5
 ; GCN-O0-NEXT:  .LBB1_4: ; %bb.inner.end
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
@@ -322,7 +323,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -508,7 +511,6 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB2_5
 ; GCN-O0-NEXT:  ; %bb.3: ; %bb.then
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
@@ -516,7 +518,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -532,7 +536,6 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; GCN-O0-NEXT:    s_branch .LBB2_5
 ; GCN-O0-NEXT:  .LBB2_4: ; %bb.else
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
@@ -540,7 +543,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    v_mov_b32_e32 v0, 2
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v0
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v3, 31, v1
 ; GCN-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -954,20 +959,21 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[0:1]
 ; GCN-O0-NEXT:    s_cbranch_execz .LBB4_2
 ; GCN-O0-NEXT:  ; %bb.1: ; %bb.then
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s0, v1, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s1, v1, 1
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s2, 0xf000
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_ashrrev_i32_e64 v2, 31, v0
 ; GCN-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v2
@@ -1103,14 +1109,14 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s8, v0, 2
 ; GCN-O0-NEXT:    v_readlane_b32 s9, v0, 3
 ; GCN-O0-NEXT:    v_readlane_b32 s6, v0, 0
 ; GCN-O0-NEXT:    v_readlane_b32 s7, v0, 1
 ; GCN-O0-NEXT:    v_writelane_b32 v0, s6, 4
 ; GCN-O0-NEXT:    v_writelane_b32 v0, s7, 5
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0x207
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, s4
@@ -1133,11 +1139,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v0, 6
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v0, 7
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b32 s6, 0
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_cmp_ne_u32_e64 s[4:5], v1, s6
@@ -1227,18 +1233,20 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_branch .LBB5_6
 ; GCN-O0-NEXT:  .LBB5_5: ; %Flow2
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_waitcnt expcnt(1)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 10
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 11
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
@@ -1247,18 +1255,20 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_branch .LBB5_7
 ; GCN-O0-NEXT:  .LBB5_6: ; %Flow
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT:    s_waitcnt expcnt(1)
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 12
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 13
 ; GCN-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -1302,11 +1312,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
 ; GCN-O0-NEXT:  .LBB5_9: ; %Flow3
 ; GCN-O0-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
@@ -1318,6 +1323,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    v_readlane_b32 s7, v4, 5
 ; GCN-O0-NEXT:    v_readlane_b32 s4, v4, 14
 ; GCN-O0-NEXT:    v_readlane_b32 s5, v4, 15
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
+; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
 ; GCN-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-O0-NEXT:    s_mov_b64 s[6:7], 0
@@ -1332,6 +1342,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
 ; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index d38e0072c94a5ab..6ef14d31ffa891d 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -46,9 +46,6 @@
 
 ; VMEM: [[ENDIF]]:
 
-; Restore val
-; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
-
 ; Reload and restore exec mask
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@@ -60,6 +57,9 @@
 
 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
 
+; Restore val
+; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
+
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
 
 ; VGPR: .amdhsa_private_segment_fixed_size 16
@@ -120,7 +120,6 @@ endif:
 ; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; GCN: [[END]]:
-; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
@@ -130,6 +129,7 @@ endif:
 ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1
 
 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
+; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
 
@@ -176,7 +176,7 @@ end:
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
-; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
@@ -189,8 +189,7 @@ end:
 ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]
 
 ; GCN: [[FLOW]]: ; %Flow
-; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET]] ; 4-byte Folded Reload
-; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
+; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] ; 4-byte Folded Reload
 ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
@@ -201,6 +200,8 @@ end:
 
 ; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]]
 
+; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
+
 ; Regular spill value restored after exec modification
 ; Followed by spill
 ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
@@ -233,7 +234,6 @@ end:
 ; GCN-NEXT: s_branch [[FLOW]]
 
 ; GCN: [[ENDIF]]:
-; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
 
@@ -245,6 +245,8 @@ end:
 
 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
 
+; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
+
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
 define amdgpu_kernel void @divergent_if_else_endif(ptr addrspace(1) %out) #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 019b5ab593fe360..f97d15739be2965 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -148,9 +148,9 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT:    s_add_u32 s16, s16, func_struct@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, func_struct@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    v_readlane_b32 s4, v40, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 27fa6712f9eb429..8098304d134229d 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -7,7 +7,8 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    v_writelane_b32 v1, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v1, s31, 1
@@ -31,238 +32,295 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b32 s5, s24
 ; CHECK-NEXT:    v_writelane_b32 v1, s51, 17
 ; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
-; CHECK-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
+; CHECK-NEXT:    ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
 ; CHECK-NEXT:    s_movk_i32 s4, 0x130
 ; CHECK-NEXT:    s_mov_b32 s5, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v4, s36, 0
-; CHECK-NEXT:    v_writelane_b32 v4, s37, 1
-; CHECK-NEXT:    v_writelane_b32 v4, s38, 2
-; CHECK-NEXT:    v_writelane_b32 v4, s39, 3
-; CHECK-NEXT:    v_writelane_b32 v4, s40, 4
-; CHECK-NEXT:    v_writelane_b32 v4, s41, 5
-; CHECK-NEXT:    v_writelane_b32 v4, s42, 6
-; CHECK-NEXT:    v_writelane_b32 v4, s43, 7
-; CHECK-NEXT:    v_writelane_b32 v4, s44, 8
-; CHECK-NEXT:    v_writelane_b32 v4, s45, 9
-; CHECK-NEXT:    v_writelane_b32 v4, s46, 10
+; CHECK-NEXT:    v_writelane_b32 v5, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v5, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v5, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v5, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v5, s40, 4
+; CHECK-NEXT:    v_writelane_b32 v5, s41, 5
+; CHECK-NEXT:    v_writelane_b32 v5, s42, 6
+; CHECK-NEXT:    v_writelane_b32 v5, s43, 7
+; CHECK-NEXT:    v_writelane_b32 v5, s44, 8
+; CHECK-NEXT:    v_writelane_b32 v5, s45, 9
+; CHECK-NEXT:    v_writelane_b32 v5, s46, 10
 ; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v4, s47, 11
-; CHECK-NEXT:    v_writelane_b32 v4, s48, 12
-; CHECK-NEXT:    v_writelane_b32 v4, s49, 13
+; CHECK-NEXT:    v_writelane_b32 v5, s47, 11
+; CHECK-NEXT:    v_writelane_b32 v5, s48, 12
+; CHECK-NEXT:    v_writelane_b32 v5, s49, 13
 ; CHECK-NEXT:    s_mov_b32 s20, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_writelane_b32 v4, s50, 14
-; CHECK-NEXT:    v_mov_b32_e32 v5, s28
-; CHECK-NEXT:    v_mov_b32_e32 v6, v2
+; CHECK-NEXT:    v_writelane_b32 v5, s50, 14
+; CHECK-NEXT:    v_mov_b32_e32 v6, s28
+; CHECK-NEXT:    v_mov_b32_e32 v7, v2
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
 ; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_writelane_b32 v4, s51, 15
+; CHECK-NEXT:    v_writelane_b32 v5, s51, 15
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v2
-; CHECK-NEXT:    image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v6, v[6:7], s[44:51], s[20:23] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v4, s4, 16
+; CHECK-NEXT:    v_writelane_b32 v5, s4, 16
+; CHECK-NEXT:    v_writelane_b32 v5, s5, 17
+; CHECK-NEXT:    v_writelane_b32 v5, s6, 18
+; CHECK-NEXT:    v_writelane_b32 v5, s7, 19
+; CHECK-NEXT:    v_writelane_b32 v5, s8, 20
+; CHECK-NEXT:    v_writelane_b32 v5, s9, 21
+; CHECK-NEXT:    image_sample_lz v7, v[2:3], s[4:11], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_writelane_b32 v5, s10, 22
+; CHECK-NEXT:    v_writelane_b32 v5, s11, 23
+; CHECK-NEXT:    v_writelane_b32 v5, s12, 24
+; CHECK-NEXT:    v_writelane_b32 v5, s13, 25
+; CHECK-NEXT:    v_writelane_b32 v5, s14, 26
+; CHECK-NEXT:    v_writelane_b32 v5, s15, 27
+; CHECK-NEXT:    v_writelane_b32 v5, s16, 28
 ; CHECK-NEXT:    v_writelane_b32 v1, s52, 18
-; CHECK-NEXT:    v_writelane_b32 v4, s5, 17
+; CHECK-NEXT:    v_writelane_b32 v5, s17, 29
 ; CHECK-NEXT:    v_writelane_b32 v1, s53, 19
-; CHECK-NEXT:    v_writelane_b32 v4, s6, 18
+; CHECK-NEXT:    v_writelane_b32 v5, s18, 30
 ; CHECK-NEXT:    v_writelane_b32 v1, s54, 20
-; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[4:11], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_writelane_b32 v4, s7, 19
+; CHECK-NEXT:    v_writelane_b32 v5, s19, 31
+; CHECK-NEXT:    s_mov_b32 s4, 48
+; CHECK-NEXT:    s_mov_b32 s5, s24
 ; CHECK-NEXT:    v_writelane_b32 v1, s55, 21
-; CHECK-NEXT:    v_writelane_b32 v4, s8, 20
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v1, s56, 22
-; CHECK-NEXT:    v_writelane_b32 v4, s9, 21
 ; CHECK-NEXT:    v_writelane_b32 v1, s57, 23
-; CHECK-NEXT:    v_writelane_b32 v4, s10, 22
 ; CHECK-NEXT:    v_writelane_b32 v1, s58, 24
-; CHECK-NEXT:    v_writelane_b32 v4, s11, 23
 ; CHECK-NEXT:    v_writelane_b32 v1, s59, 25
-; CHECK-NEXT:    v_writelane_b32 v4, s12, 24
 ; CHECK-NEXT:    v_writelane_b32 v1, s60, 26
-; CHECK-NEXT:    v_writelane_b32 v4, s13, 25
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v5, s4, 32
 ; CHECK-NEXT:    v_writelane_b32 v1, s61, 27
-; CHECK-NEXT:    v_writelane_b32 v4, s14, 26
+; CHECK-NEXT:    v_writelane_b32 v5, s5, 33
 ; CHECK-NEXT:    v_writelane_b32 v1, s62, 28
-; CHECK-NEXT:    v_writelane_b32 v4, s15, 27
+; CHECK-NEXT:    v_writelane_b32 v5, s6, 34
 ; CHECK-NEXT:    v_writelane_b32 v1, s63, 29
-; CHECK-NEXT:    v_writelane_b32 v4, s16, 28
+; CHECK-NEXT:    v_writelane_b32 v5, s7, 35
 ; CHECK-NEXT:    v_writelane_b32 v1, s64, 30
-; CHECK-NEXT:    v_writelane_b32 v4, s17, 29
+; CHECK-NEXT:    v_writelane_b32 v5, s8, 36
 ; CHECK-NEXT:    v_writelane_b32 v1, s65, 31
-; CHECK-NEXT:    v_writelane_b32 v4, s18, 30
+; CHECK-NEXT:    v_writelane_b32 v5, s9, 37
 ; CHECK-NEXT:    v_writelane_b32 v1, s66, 32
-; CHECK-NEXT:    v_writelane_b32 v4, s19, 31
-; CHECK-NEXT:    s_mov_b32 s4, 48
+; CHECK-NEXT:    s_movk_i32 s26, 0x1f0
 ; CHECK-NEXT:    s_movk_i32 s28, 0x2f0
-; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    s_mov_b32 s27, s24
 ; CHECK-NEXT:    s_mov_b32 s29, s24
+; CHECK-NEXT:    v_writelane_b32 v5, s10, 38
 ; CHECK-NEXT:    v_writelane_b32 v1, s67, 33
-; CHECK-NEXT:    s_movk_i32 s26, 0x1f0
-; CHECK-NEXT:    s_mov_b32 s27, s24
-; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[26:27], 0x0
-; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[28:29], 0x0
+; CHECK-NEXT:    v_writelane_b32 v5, s11, 39
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[26:27], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[28:29], 0x0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
+; CHECK-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mul_f32_e32 v0, v6, v5
+; CHECK-NEXT:    v_mul_f32_e32 v0, v7, v6
 ; CHECK-NEXT:    s_and_saveexec_b64 s[26:27], s[24:25]
 ; CHECK-NEXT:    s_xor_b64 s[26:27], exec, s[26:27]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_3
 ; CHECK-NEXT:  ; %bb.1: ; %bb48
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v4, s36, 32
-; CHECK-NEXT:    v_writelane_b32 v4, s37, 33
-; CHECK-NEXT:    v_writelane_b32 v4, s38, 34
-; CHECK-NEXT:    v_writelane_b32 v4, s39, 35
-; CHECK-NEXT:    v_writelane_b32 v4, s40, 36
-; CHECK-NEXT:    v_writelane_b32 v4, s41, 37
-; CHECK-NEXT:    v_writelane_b32 v4, s42, 38
-; CHECK-NEXT:    v_writelane_b32 v4, s43, 39
-; CHECK-NEXT:    v_writelane_b32 v4, s44, 40
-; CHECK-NEXT:    v_writelane_b32 v4, s45, 41
-; CHECK-NEXT:    v_writelane_b32 v4, s46, 42
-; CHECK-NEXT:    v_writelane_b32 v4, s47, 43
-; CHECK-NEXT:    v_writelane_b32 v4, s48, 44
-; CHECK-NEXT:    v_writelane_b32 v4, s49, 45
-; CHECK-NEXT:    v_writelane_b32 v4, s50, 46
-; CHECK-NEXT:    v_writelane_b32 v4, s51, 47
-; CHECK-NEXT:    v_readlane_b32 s36, v4, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v4, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v4, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v4, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v4, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v4, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v4, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v4, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v4, 15
-; CHECK-NEXT:    v_readlane_b32 s37, v4, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v4, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v4, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v4, 4
-; CHECK-NEXT:    v_readlane_b32 s41, v4, 5
-; CHECK-NEXT:    image_sample_lz v5, v[2:3], s[44:51], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s42, v4, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v4, 7
-; CHECK-NEXT:    v_readlane_b32 s36, v4, 32
-; CHECK-NEXT:    v_readlane_b32 s37, v4, 33
-; CHECK-NEXT:    v_readlane_b32 s38, v4, 34
-; CHECK-NEXT:    v_readlane_b32 s39, v4, 35
-; CHECK-NEXT:    v_readlane_b32 s40, v4, 36
-; CHECK-NEXT:    v_readlane_b32 s41, v4, 37
-; CHECK-NEXT:    v_readlane_b32 s42, v4, 38
-; CHECK-NEXT:    v_readlane_b32 s43, v4, 39
-; CHECK-NEXT:    v_readlane_b32 s44, v4, 40
-; CHECK-NEXT:    v_readlane_b32 s45, v4, 41
-; CHECK-NEXT:    v_readlane_b32 s46, v4, 42
-; CHECK-NEXT:    v_readlane_b32 s47, v4, 43
-; CHECK-NEXT:    v_readlane_b32 s48, v4, 44
-; CHECK-NEXT:    v_readlane_b32 s49, v4, 45
-; CHECK-NEXT:    v_readlane_b32 s50, v4, 46
-; CHECK-NEXT:    v_readlane_b32 s51, v4, 47
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_readlane_b32 s36, v5, 0
+; CHECK-NEXT:    v_readlane_b32 s44, v5, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v5, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v5, 10
+; CHECK-NEXT:    v_readlane_b32 s47, v5, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v5, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v5, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v5, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v5, 15
 ; CHECK-NEXT:    s_and_b64 vcc, exec, -1
+; CHECK-NEXT:    v_readlane_b32 s37, v5, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v5, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v5, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v5, 4
+; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_readlane_b32 s41, v5, 5
+; CHECK-NEXT:    v_readlane_b32 s42, v5, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v5, 7
 ; CHECK-NEXT:  .LBB0_2: ; %bb50
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_readlane_b32 s36, v5, 32
+; CHECK-NEXT:    v_readlane_b32 s40, v5, 36
+; CHECK-NEXT:    v_readlane_b32 s41, v5, 37
+; CHECK-NEXT:    v_readlane_b32 s42, v5, 38
+; CHECK-NEXT:    v_readlane_b32 s43, v5, 39
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
 ; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[44:51], s[8:11] dmask:0x1
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    image_sample_lz v2, v[2:3], s[60:67], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s37, v5, 33
+; CHECK-NEXT:    v_readlane_b32 s38, v5, 34
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    image_sample_lz v7, v[2:3], s[60:67], s[40:43] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s39, v5, 35
+; CHECK-NEXT:    image_sample_lz v2, v[2:3], s[12:19], s[20:23] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_sub_f32_e32 v2, v2, v6
+; CHECK-NEXT:    v_sub_f32_e32 v2, v2, v7
 ; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v0
-; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v5
+; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v6
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: ; %Flow14
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[28:29], s[26:27]
+; CHECK-NEXT:    s_or_saveexec_b64 s[20:21], s[26:27]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s12, v5, 32
+; CHECK-NEXT:    v_readlane_b32 s13, v5, 33
+; CHECK-NEXT:    v_readlane_b32 s14, v5, 34
+; CHECK-NEXT:    v_readlane_b32 s15, v5, 35
+; CHECK-NEXT:    v_readlane_b32 s16, v5, 36
+; CHECK-NEXT:    v_readlane_b32 s17, v5, 37
+; CHECK-NEXT:    v_readlane_b32 s18, v5, 38
+; CHECK-NEXT:    v_readlane_b32 s19, v5, 39
+; CHECK-NEXT:    v_writelane_b32 v5, s4, 56
+; CHECK-NEXT:    v_writelane_b32 v5, s5, 57
+; CHECK-NEXT:    v_writelane_b32 v5, s6, 58
+; CHECK-NEXT:    v_writelane_b32 v5, s7, 59
+; CHECK-NEXT:    v_writelane_b32 v5, s8, 60
+; CHECK-NEXT:    v_writelane_b32 v5, s9, 61
+; CHECK-NEXT:    v_writelane_b32 v5, s10, 62
+; CHECK-NEXT:    v_writelane_b32 v5, s11, 63
+; CHECK-NEXT:    v_writelane_b32 v5, s52, 40
+; CHECK-NEXT:    v_writelane_b32 v5, s53, 41
+; CHECK-NEXT:    v_writelane_b32 v5, s54, 42
+; CHECK-NEXT:    v_writelane_b32 v5, s55, 43
+; CHECK-NEXT:    v_writelane_b32 v5, s56, 44
+; CHECK-NEXT:    v_writelane_b32 v5, s57, 45
+; CHECK-NEXT:    v_writelane_b32 v5, s58, 46
+; CHECK-NEXT:    v_writelane_b32 v5, s59, 47
+; CHECK-NEXT:    v_writelane_b32 v4, s12, 0
+; CHECK-NEXT:    v_writelane_b32 v5, s60, 48
+; CHECK-NEXT:    v_writelane_b32 v4, s13, 1
+; CHECK-NEXT:    v_writelane_b32 v5, s61, 49
+; CHECK-NEXT:    v_writelane_b32 v4, s14, 2
+; CHECK-NEXT:    v_writelane_b32 v5, s62, 50
+; CHECK-NEXT:    v_writelane_b32 v4, s15, 3
+; CHECK-NEXT:    v_writelane_b32 v5, s63, 51
+; CHECK-NEXT:    v_writelane_b32 v4, s16, 4
+; CHECK-NEXT:    v_writelane_b32 v5, s64, 52
+; CHECK-NEXT:    v_writelane_b32 v4, s17, 5
+; CHECK-NEXT:    v_writelane_b32 v5, s65, 53
+; CHECK-NEXT:    v_writelane_b32 v4, s18, 6
+; CHECK-NEXT:    v_writelane_b32 v5, s66, 54
+; CHECK-NEXT:    v_writelane_b32 v4, s19, 7
+; CHECK-NEXT:    v_writelane_b32 v5, s67, 55
+; CHECK-NEXT:    s_xor_b64 exec, exec, s[20:21]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %bb32
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[24:25]
-; CHECK-NEXT:    s_xor_b64 vcc, exec, s[8:9]
+; CHECK-NEXT:    s_xor_b64 s[22:23], exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb43
 ; CHECK-NEXT:    s_mov_b32 s8, 0
-; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s8
-; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
-; CHECK-NEXT:    v_readlane_b32 s36, v4, 0
+; CHECK-NEXT:    v_readlane_b32 s36, v5, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s9
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
-; CHECK-NEXT:    v_readlane_b32 s37, v4, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v4, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v4, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v4, 4
-; CHECK-NEXT:    v_readlane_b32 s41, v4, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v4, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v4, 7
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0
-; CHECK-NEXT:    v_mov_b32_e32 v7, v6
-; CHECK-NEXT:    v_readlane_b32 s44, v4, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v4, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v4, 10
-; CHECK-NEXT:    image_sample_lz v5, v[2:3], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    s_mov_b64 s[42:43], s[18:19]
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[16:17]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[14:15]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[12:13]
-; CHECK-NEXT:    v_readlane_b32 s12, v4, 16
-; CHECK-NEXT:    v_readlane_b32 s20, v4, 24
-; CHECK-NEXT:    v_readlane_b32 s21, v4, 25
-; CHECK-NEXT:    v_readlane_b32 s22, v4, 26
-; CHECK-NEXT:    v_readlane_b32 s23, v4, 27
-; CHECK-NEXT:    v_readlane_b32 s24, v4, 28
-; CHECK-NEXT:    v_readlane_b32 s25, v4, 29
-; CHECK-NEXT:    v_readlane_b32 s26, v4, 30
-; CHECK-NEXT:    v_readlane_b32 s27, v4, 31
-; CHECK-NEXT:    v_readlane_b32 s47, v4, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v4, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v4, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v4, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v4, 15
-; CHECK-NEXT:    image_sample_lz v2, v[2:3], s[20:27], s[4:7] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s13, v4, 17
-; CHECK-NEXT:    v_readlane_b32 s14, v4, 18
-; CHECK-NEXT:    v_readlane_b32 s15, v4, 19
-; CHECK-NEXT:    v_readlane_b32 s16, v4, 20
-; CHECK-NEXT:    v_readlane_b32 s17, v4, 21
-; CHECK-NEXT:    v_readlane_b32 s18, v4, 22
-; CHECK-NEXT:    v_readlane_b32 s19, v4, 23
+; CHECK-NEXT:    v_readlane_b32 s37, v5, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v5, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v5, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v5, 4
+; CHECK-NEXT:    v_readlane_b32 s41, v5, 5
+; CHECK-NEXT:    v_readlane_b32 s42, v5, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v5, 7
+; CHECK-NEXT:    v_readlane_b32 s44, v5, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v5, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v5, 10
+; CHECK-NEXT:    v_readlane_b32 s47, v5, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v5, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v5, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v5, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v5, 15
+; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s36, v5, 16
+; CHECK-NEXT:    v_readlane_b32 s44, v5, 24
+; CHECK-NEXT:    v_readlane_b32 s45, v5, 25
+; CHECK-NEXT:    v_readlane_b32 s46, v5, 26
+; CHECK-NEXT:    v_readlane_b32 s47, v5, 27
+; CHECK-NEXT:    v_readlane_b32 s48, v5, 28
+; CHECK-NEXT:    v_readlane_b32 s49, v5, 29
+; CHECK-NEXT:    v_readlane_b32 s50, v5, 30
+; CHECK-NEXT:    v_readlane_b32 s51, v5, 31
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:    v_mov_b32_e32 v8, v7
+; CHECK-NEXT:    v_readlane_b32 s37, v5, 17
+; CHECK-NEXT:    v_readlane_b32 s38, v5, 18
+; CHECK-NEXT:    v_readlane_b32 s39, v5, 19
+; CHECK-NEXT:    image_sample_lz v2, v[2:3], s[44:51], s[12:15] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s40, v5, 20
+; CHECK-NEXT:    v_readlane_b32 s41, v5, 21
+; CHECK-NEXT:    v_readlane_b32 s42, v5, 22
+; CHECK-NEXT:    v_readlane_b32 s43, v5, 23
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dwordx3 v[5:7], off, s[8:11], 0
+; CHECK-NEXT:    buffer_store_dwordx3 v[6:8], off, s[8:11], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
 ; CHECK-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0
 ; CHECK-NEXT:  .LBB0_6: ; %Flow12
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[22:23]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_9
 ; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s6, s8
+; CHECK-NEXT:    v_readlane_b32 s36, v5, 40
 ; CHECK-NEXT:    s_mov_b32 s7, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_readlane_b32 s37, v5, 41
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s7
-; CHECK-NEXT:    image_sample_lz v5, v[2:3], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[52:59], s[8:11] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s38, v5, 42
+; CHECK-NEXT:    v_readlane_b32 s39, v5, 43
+; CHECK-NEXT:    v_readlane_b32 s40, v5, 44
+; CHECK-NEXT:    v_readlane_b32 s41, v5, 45
+; CHECK-NEXT:    v_readlane_b32 s42, v5, 46
+; CHECK-NEXT:    v_readlane_b32 s43, v5, 47
+; CHECK-NEXT:    v_readlane_b32 s44, v5, 48
+; CHECK-NEXT:    v_readlane_b32 s45, v5, 49
+; CHECK-NEXT:    v_readlane_b32 s46, v5, 50
+; CHECK-NEXT:    v_readlane_b32 s47, v5, 51
+; CHECK-NEXT:    v_readlane_b32 s48, v5, 52
+; CHECK-NEXT:    v_readlane_b32 s49, v5, 53
+; CHECK-NEXT:    v_readlane_b32 s50, v5, 54
+; CHECK-NEXT:    v_readlane_b32 s51, v5, 55
+; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
+; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
+; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
+; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s36, v5, 56
+; CHECK-NEXT:    v_readlane_b32 s37, v5, 57
+; CHECK-NEXT:    v_readlane_b32 s38, v5, 58
+; CHECK-NEXT:    v_readlane_b32 s39, v5, 59
+; CHECK-NEXT:    v_readlane_b32 s40, v5, 60
+; CHECK-NEXT:    v_readlane_b32 s41, v5, 61
+; CHECK-NEXT:    v_readlane_b32 s42, v5, 62
+; CHECK-NEXT:    v_readlane_b32 s43, v5, 63
+; CHECK-NEXT:    ; kill: killed $vgpr2_vgpr3
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
+; CHECK-NEXT:    v_readlane_b32 s44, v4, 0
+; CHECK-NEXT:    v_readlane_b32 s45, v4, 1
+; CHECK-NEXT:    v_readlane_b32 s46, v4, 2
+; CHECK-NEXT:    v_readlane_b32 s47, v4, 3
+; CHECK-NEXT:    image_sample_lz v7, v[2:3], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s48, v4, 4
+; CHECK-NEXT:    v_readlane_b32 s49, v4, 5
+; CHECK-NEXT:    v_readlane_b32 s50, v4, 6
+; CHECK-NEXT:    v_readlane_b32 s51, v4, 7
+; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+; CHECK-NEXT:    ; kill: killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43
+; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_sub_f32_e32 v2, v6, v5
+; CHECK-NEXT:    v_sub_f32_e32 v2, v7, v6
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:  .LBB0_8: ; %bb33
@@ -274,8 +332,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:  .LBB0_9: ; %Flow13
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT:    s_or_b64 exec, exec, s[28:29]
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_or_b64 exec, exec, s[20:21]
 ; CHECK-NEXT:    v_readlane_b32 s67, v1, 33
 ; CHECK-NEXT:    v_readlane_b32 s66, v1, 32
 ; CHECK-NEXT:    v_readlane_b32 s65, v1, 31
@@ -310,10 +367,12 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_readlane_b32 s36, v1, 2
 ; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
+; CHECK-NEXT:    ; kill: killed $vgpr5
 ; CHECK-NEXT:    ; kill: killed $vgpr4
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index c877740c1baa9f0..fd261e8e2bb25e3 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -210,7 +210,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -222,7 +221,9 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 5
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 6
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
@@ -232,11 +233,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readlane_b32 s4, v1, 1
 ; W64-O0-NEXT:    v_readlane_b32 s5, v1, 2
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    ; kill: killed $vgpr1
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -601,7 +602,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB1_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -613,7 +613,9 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 5
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 6
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -668,7 +670,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.5: ; in Loop: Header=BB1_4 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -680,7 +681,9 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 13
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 14
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
@@ -690,16 +693,16 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readlane_b32 s4, v0, 9
+; W64-O0-NEXT:    v_readlane_b32 s5, v0, 10
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(6)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 9
-; W64-O0-NEXT:    v_readlane_b32 s5, v0, 10
-; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    global_store_dword v[4:5], v6, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -1119,7 +1122,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB2_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1131,7 +1133,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 6
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 7
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 1
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -1141,13 +1145,13 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readlane_b32 s6, v0, 2
 ; W64-O0-NEXT:    v_readlane_b32 s7, v0, 3
 ; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; W64-O0-NEXT:    v_readlane_b32 s4, v0, 1
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_and_b32_e64 v2, v2, s5
@@ -1214,7 +1218,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.6: ; in Loop: Header=BB2_5 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1226,14 +1229,15 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 17
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 18
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 12
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB2_5
 ; W64-O0-NEXT:  ; %bb.7:
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1241,18 +1245,20 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    v_readlane_b32 s4, v1, 13
 ; W64-O0-NEXT:    v_readlane_b32 s5, v1, 14
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; W64-O0-NEXT:  .LBB2_8: ; %bb2
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readlane_b32 s4, v0, 10
 ; W64-O0-NEXT:    v_readlane_b32 s5, v0, 11
 ; W64-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    global_store_dword v[1:2], v3, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index ac46f8ce20d60bf..72fbcc7bd9e40f5 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -224,7 +224,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -236,7 +235,9 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 5
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 6
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
@@ -246,11 +247,11 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(1)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readlane_b32 s4, v1, 1
 ; W64-O0-NEXT:    v_readlane_b32 s5, v1, 2
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    ; kill: killed $vgpr1
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -640,7 +641,6 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB1_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -652,7 +652,9 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 5
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 6
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -707,7 +709,6 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.5: ; in Loop: Header=BB1_4 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -719,7 +720,9 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 13
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 14
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 0
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -729,16 +732,16 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readlane_b32 s4, v0, 9
+; W64-O0-NEXT:    v_readlane_b32 s5, v0, 10
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(6)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 9
-; W64-O0-NEXT:    v_readlane_b32 s5, v0, 10
-; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    global_store_dword v[4:5], v6, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -1175,7 +1178,6 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB2_1 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1187,7 +1189,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 6
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 7
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 1
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -1197,13 +1201,13 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readlane_b32 s6, v0, 2
 ; W64-O0-NEXT:    v_readlane_b32 s7, v0, 3
 ; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; W64-O0-NEXT:    v_readlane_b32 s4, v0, 1
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_and_b32_e64 v2, v2, s5
@@ -1222,12 +1226,12 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readlane_b32 s4, v0, 0
 ; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(4)
-; W64-O0-NEXT:    v_readlane_b32 s4, v0, 0
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_mov_b32_e32 v7, v5
 ; W64-O0-NEXT:    v_mov_b32_e32 v1, v4
@@ -1291,7 +1295,6 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
 ; W64-O0-NEXT:  ; %bb.6: ; in Loop: Header=BB2_5 Depth=1
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1303,14 +1306,15 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    v_readlane_b32 s10, v1, 17
 ; W64-O0-NEXT:    v_readlane_b32 s11, v1, 18
 ; W64-O0-NEXT:    v_readlane_b32 s6, v1, 12
-; W64-O0-NEXT:    s_nop 4
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 2
 ; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; W64-O0-NEXT:    s_cbranch_execnz .LBB2_5
 ; W64-O0-NEXT:  ; %bb.7:
-; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
@@ -1318,18 +1322,20 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    v_readlane_b32 s4, v1, 13
 ; W64-O0-NEXT:    v_readlane_b32 s5, v1, 14
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; W64-O0-NEXT:  .LBB2_8: ; %bb2
 ; W64-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[16:17]
-; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    v_readlane_b32 s4, v0, 10
 ; W64-O0-NEXT:    v_readlane_b32 s5, v0, 11
 ; W64-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
 ; W64-O0-NEXT:    global_store_dword v[1:2], v3, off
 ; W64-O0-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index 454dc881f7bf2cd..a4948921ed4270d 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -223,15 +223,12 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_readlane_b32 s8, v2, 56
 ; GCN-NEXT:    v_readlane_b32 s9, v2, 57
 ; GCN-NEXT:    v_readlane_b32 s10, v2, 58
@@ -296,6 +293,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    v_readlane_b32 s73, v2, 5
 ; GCN-NEXT:    v_readlane_b32 s74, v2, 6
 ; GCN-NEXT:    v_readlane_b32 s75, v2, 7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s76, v1, 56
 ; GCN-NEXT:    v_readlane_b32 s77, v1, 57
 ; GCN-NEXT:    v_readlane_b32 s78, v1, 58
@@ -320,6 +318,9 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    v_readlane_b32 s5, v1, 5
 ; GCN-NEXT:    v_readlane_b32 s6, v1, 6
 ; GCN-NEXT:    v_readlane_b32 s7, v1, 7
+; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
@@ -378,6 +379,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[0:7]
 ; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-NEXT:    v_readlane_b32 s1, v0, 1
 ; GCN-NEXT:    v_readlane_b32 s2, v0, 2
@@ -595,12 +597,12 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[28:29]
 ; GCN-NEXT:    s_or_saveexec_b64 s[28:29], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[28:29]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_readlane_b32 s16, v1, 8
 ; GCN-NEXT:    v_readlane_b32 s17, v1, 9
 ; GCN-NEXT:    v_readlane_b32 s20, v1, 0
@@ -611,6 +613,7 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    v_readlane_b32 s25, v1, 5
 ; GCN-NEXT:    v_readlane_b32 s26, v1, 6
 ; GCN-NEXT:    v_readlane_b32 s27, v1, 7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s36, v0, 32
 ; GCN-NEXT:    v_readlane_b32 s37, v0, 33
 ; GCN-NEXT:    v_readlane_b32 s38, v0, 34
@@ -854,9 +857,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -908,6 +908,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    v_readlane_b32 s29, v1, 13
 ; GCN-NEXT:    v_readlane_b32 s30, v1, 14
 ; GCN-NEXT:    v_readlane_b32 s31, v1, 15
+; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[16:31]
 ; GCN-NEXT:    ;;#ASMEND
@@ -930,6 +933,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    v_readlane_b32 s17, v1, 61
 ; GCN-NEXT:    v_readlane_b32 s18, v1, 62
 ; GCN-NEXT:    v_readlane_b32 s19, v1, 63
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-NEXT:    v_readlane_b32 s1, v0, 1
 ; GCN-NEXT:    ;;#ASMSTART
@@ -1105,9 +1109,6 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[34:35]
-; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GCN-NEXT:    buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -1159,6 +1160,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    v_readlane_b32 s29, v2, 13
 ; GCN-NEXT:    v_readlane_b32 s30, v2, 14
 ; GCN-NEXT:    v_readlane_b32 s31, v2, 15
+; GCN-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GCN-NEXT:    buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; def v0
 ; GCN-NEXT:    ;;#ASMEND
@@ -1184,6 +1188,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
 ; GCN-NEXT:    v_readlane_b32 s17, v2, 61
 ; GCN-NEXT:    v_readlane_b32 s18, v2, 62
 ; GCN-NEXT:    v_readlane_b32 s19, v2, 63
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s0, v1, 0
 ; GCN-NEXT:    v_readlane_b32 s1, v1, 1
 ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index fbbcfd62f5c602d..fec732eff798c8c 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -123,9 +123,6 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %bb0
 ; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[24:25]
-; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[24:25]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -145,6 +142,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT:    v_readlane_b32 s17, v1, 13
 ; GCN-NEXT:    v_readlane_b32 s18, v1, 14
 ; GCN-NEXT:    v_readlane_b32 s19, v1, 15
+; GCN-NEXT:    s_or_saveexec_b64 s[24:25], -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[24:25]
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use s[4:19]
 ; GCN-NEXT:    ;;#ASMEND
@@ -202,6 +202,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
 ; GCN-NEXT:    v_readlane_b32 s21, v1, 61
 ; GCN-NEXT:    v_readlane_b32 s22, v1, 62
 ; GCN-NEXT:    v_readlane_b32 s23, v1, 63
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_readlane_b32 s4, v0, 0
 ; GCN-NEXT:    v_readlane_b32 s5, v0, 1
 ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 08db1e7fee259d6..033fd8ef89cfe06 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10083,17 +10083,30 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_add_u32 s40, s40, s3
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
+; GFX6-NEXT:    s_addc_u32 s41, s41, 0
 ; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v5, -1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, 15
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_writelane_b32 v0, s0, 0
+; GFX6-NEXT:    v_writelane_b32 v0, s1, 1
+; GFX6-NEXT:    v_writelane_b32 v0, s2, 2
+; GFX6-NEXT:    v_writelane_b32 v0, s3, 3
+; GFX6-NEXT:    s_mov_b32 s8, 0x80400
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s8 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 8, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240
-; GFX6-NEXT:    s_addc_u32 s41, s41, 0
-; GFX6-NEXT:    s_mov_b32 s2, 0x83800
+; GFX6-NEXT:    s_mov_b32 s2, 0x84400
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10103,7 +10116,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT:    s_mov_b32 s2, 0x83400
+; GFX6-NEXT:    s_mov_b32 s2, 0x84000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10112,7 +10125,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT:    s_mov_b32 s2, 0x83000
+; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10121,7 +10134,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x83800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10130,7 +10143,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT:    s_mov_b32 s2, 0x82800
+; GFX6-NEXT:    s_mov_b32 s2, 0x83400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10139,7 +10152,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT:    s_mov_b32 s2, 0x82400
+; GFX6-NEXT:    s_mov_b32 s2, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10148,7 +10161,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT:    s_mov_b32 s2, 0x82000
+; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10157,7 +10170,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10166,7 +10179,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT:    s_mov_b32 s2, 0x81800
+; GFX6-NEXT:    s_mov_b32 s2, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10175,7 +10188,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT:    s_mov_b32 s2, 0x81400
+; GFX6-NEXT:    s_mov_b32 s2, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10184,25 +10197,41 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT:    s_mov_b32 s2, 0x81000
+; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64
+; GFX6-NEXT:    s_mov_b32 s2, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64
 ; GFX6-NEXT:    buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16
-; GFX6-NEXT:    s_mov_b32 s2, 0x80800
+; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:32
+; GFX6-NEXT:    s_mov_b32 s2, 0x81000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
@@ -10211,25 +10240,25 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
 ; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
 ; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
-; GFX6-NEXT:    s_mov_b32 s10, 0x80400
+; GFX6-NEXT:    s_mov_b32 s10, 0x80800
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
+; GFX6-NEXT:    s_mov_b32 s0, 0x81800
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(3)
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 1
-; GFX6-NEXT:    s_mov_b64 s[2:3], exec
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    buffer_store_dword v7, v4, s[40:43], 0 offen
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s[4:11]
@@ -10245,12 +10274,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
-; GFX6-NEXT:    s_mov_b32 s12, 0x83c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s2, 0x84800
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s[8:15]
@@ -10262,18 +10291,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    ; def s[24:31]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
-; GFX6-NEXT:    ; def s[4:7]
+; GFX6-NEXT:    ; def s[0:3]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
-; GFX6-NEXT:    ; def s[2:3]
+; GFX6-NEXT:    ; def s[4:5]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s33
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    s_and_saveexec_b64 s[34:35], vcc
+; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT:    s_mov_b64 vcc, s[6:7]
 ; GFX6-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX6-NEXT:  ; %bb.1: ; %bb0
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10285,18 +10315,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x84400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s34, 0x85000
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s34, 0x84800
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s8, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s9, v4, 1
@@ -10308,8 +10338,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s15, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10321,18 +10351,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s34, 0x85800
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x84400
+; GFX6-NEXT:    s_mov_b32 s34, 0x85000
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s16, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s17, v4, 1
@@ -10344,8 +10374,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s23, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10357,18 +10387,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x85400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s34, 0x86000
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
+; GFX6-NEXT:    s_mov_b32 s34, 0x85800
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s24, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s25, v4, 1
@@ -10380,8 +10410,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s31, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10389,44 +10419,30 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
 ; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
 ; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
-; GFX6-NEXT:    s_mov_b32 s38, 0x85c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s34, 0x86800
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
-; GFX6-NEXT:    s_mov_b64 exec, 15
+; GFX6-NEXT:    s_mov_b64 exec, 3
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
 ; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
-; GFX6-NEXT:    v_writelane_b32 v4, s6, 2
-; GFX6-NEXT:    v_writelane_b32 v4, s7, 3
-; GFX6-NEXT:    s_mov_b32 s36, 0x86000
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
-; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s2, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s3, 1
-; GFX6-NEXT:    s_mov_b32 s4, 0x86400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s2, 0x86c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 s[34:35], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x85400
+; GFX6-NEXT:    s_mov_b32 s36, 0x86000
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
@@ -10438,13 +10454,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s7, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[44:45], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX6-NEXT:    s_mov_b64 s[34:35], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2180
+; GFX6-NEXT:    s_mov_b32 s44, 0x86800
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s36, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s37, v4, 1
@@ -10452,11 +10468,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s39, v4, 3
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
-; GFX6-NEXT:    s_mov_b64 vcc, s[34:35]
+; GFX6-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2190
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x21b0
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
@@ -10469,54 +10484,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35]
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    s_mov_b64 s[34:35], vcc
-; GFX6-NEXT:    s_mov_b64 s[4:5], exec
-; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    s_mov_b32 s6, 0x85c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_mov_b32 s2, 0x84400
-; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(4)
-; GFX6-NEXT:    v_mov_b32_e32 v0, v17
-; GFX6-NEXT:    v_mov_b32_e32 v1, v18
-; GFX6-NEXT:    v_mov_b32_e32 v2, v19
-; GFX6-NEXT:    v_mov_b32_e32 v3, v20
-; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    v_mov_b32_e32 v20, v3
-; GFX6-NEXT:    buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
-; GFX6-NEXT:    v_mov_b32_e32 v19, v2
-; GFX6-NEXT:    v_mov_b32_e32 v18, v1
-; GFX6-NEXT:    v_mov_b32_e32 v17, v0
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
@@ -10530,128 +10499,158 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:  .LBB1_2: ; %ret
-; GFX6-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GFX6-NEXT:    s_mov_b64 s[2:3], exec
+; GFX6-NEXT:    s_or_b64 exec, exec, vcc
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    s_mov_b32 s8, 0x80400
+; GFX6-NEXT:    s_mov_b32 s6, 0x80400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s4, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s5, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s6, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s7, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83800
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b64 s[36:37], s[0:1]
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
+; GFX6-NEXT:    s_mov_b64 exec, 15
+; GFX6-NEXT:    s_mov_b32 s6, 0x80800
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b32 s0, 0x84400
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[5:6], 8
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83400
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x83000
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82c00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82800
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82400
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82000
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81c00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81800
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81400
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81000
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x80c00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80
-; GFX6-NEXT:    buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v17, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v18, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v19, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v20, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x80800
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48
-; GFX6-NEXT:    buffer_store_dwordx4 v[13:16], v[4:5], s[0:3], 0 addr64 offset:32
-; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:16
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b64 s[38:39], s[2:3]
+; GFX6-NEXT:    s_mov_b32 s0, 0x84000
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:240
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x83c00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:224
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x83800
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:208
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x83400
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:192
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x83000
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:176
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x82c00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:160
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x82800
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:144
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x82400
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:128
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x82000
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:112
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x81c00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:96
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x81400
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:80
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x81800
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:64
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x81000
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:48
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x80c00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:32
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[36:39], 0 addr64 offset:16
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[36:39], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-FLATSCR-LABEL: test_limited_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index 7201ffaf561662a..6c98c7def23289b 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -45,9 +45,9 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], 0
 ; CHECK-NEXT:    s_mov_b32 s2, 0
 ; CHECK-NEXT:    s_mov_b32 s3, 0x40140000
-; CHECK-NEXT:    v_writelane_b32 v2, s6, 9
-; CHECK-NEXT:    v_writelane_b32 v2, s7, 10
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 11
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 9
+; CHECK-NEXT:    v_writelane_b32 v2, s6, 10
+; CHECK-NEXT:    v_writelane_b32 v2, s7, 11
 ; CHECK-NEXT:    v_readlane_b32 s6, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s7, v2, 2
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
@@ -55,8 +55,8 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK-NEXT:    s_mov_b32 s0, s2
 ; CHECK-NEXT:    v_writelane_b32 v2, s6, 1
 ; CHECK-NEXT:    v_writelane_b32 v2, s7, 2
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 9
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 10
+; CHECK-NEXT:    v_readlane_b32 s6, v2, 10
+; CHECK-NEXT:    v_readlane_b32 s7, v2, 11
 ; CHECK-NEXT:    s_mov_b32 s6, s2
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
 ; CHECK-NEXT:    v_readlane_b32 s0, v2, 3
@@ -92,11 +92,11 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK-NEXT:    s_mov_b32 s4, s0
 ; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
 ; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
-; CHECK-NEXT:    v_readlane_b32 s2, v2, 11
+; CHECK-NEXT:    v_readlane_b32 s2, v2, 9
 ; CHECK-NEXT:    s_add_i32 s2, s2, s0
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 11
+; CHECK-NEXT:    v_writelane_b32 v2, s2, 9
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 11
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 9
 ; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.cond.cleanup.loopexit
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index c84eee4f9921d08..48e27f4480d7857 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -39,7 +39,6 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %bb193
 ; CHECK-NEXT:  .LBB0_2: ; %bb194
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
@@ -47,7 +46,9 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-NEXT:    v_readlane_b32 s4, v1, 0
 ; CHECK-NEXT:    v_readlane_b32 s5, v1, 1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], v0, s4
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_4
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index b6e7da97e008905..d2340872eb545bb 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -223,9 +223,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_readlane_b32 s36, v0, 4
 ; GFX9-O0-NEXT:    v_readlane_b32 s37, v0, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[36:37]
@@ -233,6 +231,8 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    v_readlane_b32 s39, v0, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s34, v0, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s35, v0, 3
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[36:37], v3, v4
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[36:37]


