[llvm] 5aea839 - [AMDGPU] Switch to backwards scavenging in eliminateFrameIndex

Jay Foad via llvm-commits <llvm-commits@lists.llvm.org>
Wed Jun 7 13:01:34 PDT 2023


Author: Jay Foad
Date: 2023-06-07T20:59:05+01:00
New Revision: 5aea839ab3edeb58bb0eea833ad9427d0ae7ef67

URL: https://github.com/llvm/llvm-project/commit/5aea839ab3edeb58bb0eea833ad9427d0ae7ef67
DIFF: https://github.com/llvm/llvm-project/commit/5aea839ab3edeb58bb0eea833ad9427d0ae7ef67.diff

LOG: [AMDGPU] Switch to backwards scavenging in eliminateFrameIndex

Frame index elimination runs backwards, so we must use backwards
scavenging. Otherwise, when a scavenged register is spilled, the
scavenger will remember that the register is in use until the restore
point, but it will never reach that restore point. The result is that in
some cases it will keep scavenging different registers instead of
reusing the same one.

Differential Revision: https://reviews.llvm.org/D152394
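
A minimal sketch of the call-site pattern, for context (the helper name and
enclosing function are illustrative only and not part of the patch; the
register class and flags are taken from the hunks below):

    // Sketch: how each scavengeRegister call site is converted in this patch.
    #include "llvm/CodeGen/RegisterScavenging.h"
    using namespace llvm;

    static Register scavengeTmpVGPR(RegScavenger *RS,
                                    MachineBasicBlock::iterator MI) {
      // Before: forward scavenging. On a backwards walk the scavenger never
      // reaches the restore point it records for a spilled register, so it
      // keeps picking fresh registers instead of reusing one.
      //   return RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI,
      //                               /*SPAdj=*/0, /*AllowSpill=*/false);

      // After: backwards scavenging, matching the direction in which
      // eliminateFrameIndex visits instructions.
      return RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
                                           /*RestoreAfter=*/false, /*SPAdj=*/0,
                                           /*AllowSpill=*/false);
    }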

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
    llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/frame-index.mir
    llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
    llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
    llvm/test/CodeGen/AMDGPU/scratch-simple.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
    llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
    llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
    llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
    llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d750088e12f15..996a02241326f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -170,7 +170,8 @@ struct SGPRSpillBuilder {
     // a register as actually in use in another lane, so we need to save all
     // used lanes of the chosen VGPR.
     assert(RS && "Cannot spill SGPR to memory without RegScavenger");
-    TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);
+    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
+                                            0, false);
 
     // Reserve temporary stack slot
     TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
@@ -199,7 +200,7 @@ struct SGPRSpillBuilder {
     const TargetRegisterClass &RC =
         IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
     RS->setRegUsed(SuperReg);
-    SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);
+    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
 
     int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
 
@@ -1591,7 +1592,8 @@ void SIRegisterInfo::buildSpillLoadStore(
     } else if (UseVGPROffset) {
       // FIXME: change to scavengeRegisterBackwards()
       if (!TmpOffsetVGPR) {
-        TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
+                                                      MI, false, 0);
         RS->setRegUsed(TmpOffsetVGPR);
       }
     }
@@ -2282,7 +2284,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                                 : &AMDGPU::VGPR_32RegClass;
 
-        Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
+        Register TmpReg =
+            RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
         FIOp.setReg(TmpReg);
         FIOp.setIsKill();
 
@@ -2302,8 +2305,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
         Register TmpSReg =
             UseSGPR ? TmpReg
-                    : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
-                                           !UseSGPR);
+                    : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+                                                    MI, false, 0, !UseSGPR);
 
         // TODO: for flat scratch another attempt can be made with a VGPR index
         //       if no SGPRs can be scavenged.
@@ -2377,8 +2380,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                             : &AMDGPU::VGPR_32RegClass;
         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                       MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
-        Register ResultReg = IsCopy ? MI->getOperand(0).getReg()
-                                    : RS->scavengeRegister(RC, MI, 0);
+        Register ResultReg =
+            IsCopy ? MI->getOperand(0).getReg()
+                   : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
 
         int64_t Offset = FrameInfo.getObjectOffset(Index);
         if (Offset == 0) {
@@ -2391,8 +2395,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
           if (IsSALU && !LiveSCC)
             Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
           if (IsSALU && LiveSCC) {
-            Register NewDest =
-                RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0);
+            Register NewDest = RS->scavengeRegisterBackwards(
+                AMDGPU::SReg_32RegClass, Shift, false, 0);
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                     NewDest)
                 .addReg(ResultReg);
@@ -2446,8 +2450,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
             // We may have 1 free scratch SGPR even though a carry out is
             // unavailable. Only one additional mov is needed.
-            Register TmpScaledReg =
-              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+            Register TmpScaledReg = RS->scavengeRegisterBackwards(
+                AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
             Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
 
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
@@ -2512,7 +2516,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       FIOp.ChangeToImmediate(Offset);
       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
-        Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+        Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
+                                                        MI, false, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
           .addImm(Offset);
         FIOp.ChangeToRegister(TmpReg, false, false, true);

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 61645d7738fec..48c1ef46c1976 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -159,50 +159,32 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT:    s_mov_b32 s8, 0
-; FLATSCR-NEXT:    s_mov_b32 s12, 0
-; FLATSCR-NEXT:    s_mov_b32 s11, 0
-; FLATSCR-NEXT:    s_mov_b32 s10, 0
-; FLATSCR-NEXT:    s_mov_b32 s9, 0
-; FLATSCR-NEXT:    s_mov_b32 s13, 0
-; FLATSCR-NEXT:    s_mov_b32 s7, 0
-; FLATSCR-NEXT:    s_mov_b32 s5, 0
-; FLATSCR-NEXT:    s_mov_b32 s3, 0
-; FLATSCR-NEXT:    s_mov_b32 s1, 0
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
-; FLATSCR-NEXT:    s_mov_b32 s2, 0
-; FLATSCR-NEXT:    s_mov_b32 s4, 0
-; FLATSCR-NEXT:    s_mov_b32 s6, 0
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s8 offset:8
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s12 offset:16
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s11 offset:24
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s10 offset:32
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s9 offset:40
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s13 offset:48
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s7 offset:56
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s5 offset:64
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s3 offset:72
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s1 offset:80
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:8
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:24
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:32
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:40
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:48
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:56
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:64
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:72
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:80
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:88
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s2 offset:96
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s4 offset:104
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s6 offset:112
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s8 offset:120
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s12 offset:128
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s11 offset:8
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:96
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:104
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:112
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:120
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:128
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:8
 ; FLATSCR-NEXT:    s_nop 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s10 offset:16
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s9 offset:24
-; FLATSCR-NEXT:    s_mov_b32 s37, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[6:7], off, s37 offset:32
-; FLATSCR-NEXT:    s_mov_b32 s36, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s36 offset:40
-; FLATSCR-NEXT:    s_mov_b32 s35, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s35 offset:48
-; FLATSCR-NEXT:    s_mov_b32 s34, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[12:13], off, s34 offset:56
-; FLATSCR-NEXT:    s_mov_b32 s33, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[14:15], off, s33 offset:64
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s0 offset:16
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s0 offset:24
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[6:7], off, s0 offset:32
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s0 offset:40
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s0 offset:48
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[12:13], off, s0 offset:56
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[14:15], off, s0 offset:64
 ; FLATSCR-NEXT:    s_movk_i32 s32, 0x50
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 7983cba3478db..33a4d3c5494f7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -239,6 +239,8 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
 ; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
+; GCN-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v1, 0x100, v1
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:256
 ; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:260
 ; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:264
@@ -308,10 +310,8 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_bfe_u32 v0, v6, 1, 6
-; GCN-NEXT:    v_lshrrev_b32_e64 v2, 6, s33
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_add_u32_e32 v2, 0x100, v2
-; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
+; GCN-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v6
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 411b599519def..a476d595f988a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -279,8 +279,8 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-NEXT:    s_mov_b32 s2, 0
-; GFX9-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc
+; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
@@ -459,9 +459,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    s_add_i32 s1, s32, 0x100
+; GFX9-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, s1, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_add_i32 s0, s32, 0x100
@@ -478,13 +478,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_add_i32 s1, s32, 0x100
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -544,8 +544,8 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-NEXT:    s_mov_b32 s2, 0
-; GFX9-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc
+; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
@@ -728,9 +728,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    s_add_i32 s1, s32, 0x4004
+; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, s1, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -747,13 +747,13 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -767,8 +767,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    s_add_i32 s1, s32, 0x4004
-; GFX940-NEXT:    v_add_u32_e32 v1, s1, v1
+; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX940-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
@@ -785,12 +785,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, s1, v1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, s0, v1
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    scratch_store_b32 v1, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -816,10 +815,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
-; GFX9-NEXT:    s_mov_b32 s1, 0
-; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
-; GFX9-NEXT:    scratch_store_dword off, v0, s1 offset:4
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    s_add_i32 s0, s0, 4
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 37a3fb547c63f..00f8349e93cd0 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -481,25 +481,22 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT:    s_mov_b32 s6, 0
-; FLATSCR-NEXT:    s_mov_b32 s5, 0
 ; FLATSCR-NEXT:    s_mov_b32 s4, 0
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1]
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_short off, v0, s6 offset:4
+; FLATSCR-NEXT:    scratch_store_short off, v0, s4 offset:4
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_short off, v0, s5 offset:6
+; FLATSCR-NEXT:    scratch_store_short off, v0, s4 offset:6
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1] offset:4
-; FLATSCR-NEXT:    s_mov_b32 s1, 0
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_short off, v0, s4 offset:8
+; FLATSCR-NEXT:    scratch_store_short off, v0, s0 offset:8
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s1 offset:4
+; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:4
 ; FLATSCR-NEXT:    scratch_load_dword v1, off, s0 offset:6
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
@@ -544,27 +541,24 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; FLATSCR_GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; FLATSCR_GFX10-NEXT:    s_mov_b32 s6, 0
-; FLATSCR_GFX10-NEXT:    s_mov_b32 s5, 0
 ; FLATSCR_GFX10-NEXT:    s_mov_b32 s4, 0
 ; FLATSCR_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    global_load_ushort v0, v2, s[0:1]
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s6 offset:4
+; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s4 offset:4
 ; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT:    global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s5 offset:6
+; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s4 offset:6
 ; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT:    global_load_ushort v0, v2, s[0:1] offset:4
 ; FLATSCR_GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; FLATSCR_GFX10-NEXT:    s_mov_b32 s1, 0
 ; FLATSCR_GFX10-NEXT:    s_mov_b32 s0, 0
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s4 offset:8
+; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s0 offset:8
 ; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT:    s_clause 0x1
-; FLATSCR_GFX10-NEXT:    scratch_load_dword v0, off, s1 offset:4
+; FLATSCR_GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:4
 ; FLATSCR_GFX10-NEXT:    scratch_load_dword v1, off, s0 offset:6
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 575f6143bc6a0..72ee660dc2adb 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -9,8 +9,8 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_load_ubyte v0, v[0:1]
 ; CHECK-NEXT:    v_and_b32_e32 v1, 7, v2
-; CHECK-NEXT:    v_lshr_b32_e64 v9, s32, 6
-; CHECK-NEXT:    v_or_b32_e32 v1, v9, v1
+; CHECK-NEXT:    v_lshr_b32_e64 v2, s32, 6
+; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_bfe_u32 v2, v0, 1, 1
 ; CHECK-NEXT:    v_bfe_u32 v3, v0, 2, 2

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
index 5fea62ca5b375..41fea1a6c30a4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -117,21 +117,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; FLAT_SCR_OPT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s104, exec_lo
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s4, exec_lo
 ; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, 3
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s4, 0
-; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v72, s4
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s5, 0
+; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v0, s5
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT_SCR_OPT-NEXT:    v_writelane_b32 v72, s2, 0
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s4, 4
-; FLAT_SCR_OPT-NEXT:    v_writelane_b32 v72, s3, 1
-; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v72, s4 ; 4-byte Folded Spill
+; FLAT_SCR_OPT-NEXT:    v_writelane_b32 v0, s2, 0
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s5, 4
+; FLAT_SCR_OPT-NEXT:    v_writelane_b32 v0, s3, 1
+; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v0, s5 ; 4-byte Folded Spill
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s4, 0
-; FLAT_SCR_OPT-NEXT:    scratch_load_dword v72, off, s4
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s5, 0
+; FLAT_SCR_OPT-NEXT:    scratch_load_dword v0, off, s5
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, s104
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, s4
 ; FLAT_SCR_OPT-NEXT:    s_load_dword vcc_lo, s[0:1], 0x8
 ; FLAT_SCR_OPT-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; FLAT_SCR_OPT-NEXT:    ;;#ASMSTART
@@ -231,16 +231,16 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; FLAT_SCR_OPT-NEXT:    s_mov_b32 s2, exec_lo
 ; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, 3
 ; FLAT_SCR_OPT-NEXT:    s_mov_b32 s3, 0
-; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v2, s3
+; FLAT_SCR_OPT-NEXT:    scratch_store_dword off, v1, s3
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
 ; FLAT_SCR_OPT-NEXT:    s_mov_b32 s3, 4
-; FLAT_SCR_OPT-NEXT:    scratch_load_dword v2, off, s3 ; 4-byte Folded Reload
+; FLAT_SCR_OPT-NEXT:    scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
 ; FLAT_SCR_OPT-NEXT:    s_mov_b32 s3, 0
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt vmcnt(0)
-; FLAT_SCR_OPT-NEXT:    v_readlane_b32 s0, v2, 0
-; FLAT_SCR_OPT-NEXT:    v_readlane_b32 s1, v2, 1
-; FLAT_SCR_OPT-NEXT:    scratch_load_dword v2, off, s3
+; FLAT_SCR_OPT-NEXT:    v_readlane_b32 s0, v1, 0
+; FLAT_SCR_OPT-NEXT:    v_readlane_b32 s1, v1, 1
+; FLAT_SCR_OPT-NEXT:    scratch_load_dword v1, off, s3
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt_depctr 0xffe3
 ; FLAT_SCR_OPT-NEXT:    s_mov_b32 exec_lo, s2
@@ -251,21 +251,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; FLAT_SCR_ARCH-LABEL: test:
 ; FLAT_SCR_ARCH:       ; %bb.0:
 ; FLAT_SCR_ARCH-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s104, exec_lo
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s4, exec_lo
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, 3
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s4, 0
-; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v72, s4
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s5, 0
+; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v0, s5
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT_SCR_ARCH-NEXT:    v_writelane_b32 v72, s2, 0
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s4, 4
-; FLAT_SCR_ARCH-NEXT:    v_writelane_b32 v72, s3, 1
-; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v72, s4 ; 4-byte Folded Spill
+; FLAT_SCR_ARCH-NEXT:    v_writelane_b32 v0, s2, 0
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s5, 4
+; FLAT_SCR_ARCH-NEXT:    v_writelane_b32 v0, s3, 1
+; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v0, s5 ; 4-byte Folded Spill
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s4, 0
-; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v72, off, s4
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s5, 0
+; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v0, off, s5
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, s104
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, s4
 ; FLAT_SCR_ARCH-NEXT:    s_load_dword vcc_lo, s[0:1], 0x8
 ; FLAT_SCR_ARCH-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; FLAT_SCR_ARCH-NEXT:    ;;#ASMSTART
@@ -365,16 +365,16 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s2, exec_lo
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, 3
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s3, 0
-; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v2, s3
+; FLAT_SCR_ARCH-NEXT:    scratch_store_dword off, v1, s3
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s3, 4
-; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v2, off, s3 ; 4-byte Folded Reload
+; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s3, 0
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt vmcnt(0)
-; FLAT_SCR_ARCH-NEXT:    v_readlane_b32 s0, v2, 0
-; FLAT_SCR_ARCH-NEXT:    v_readlane_b32 s1, v2, 1
-; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v2, off, s3
+; FLAT_SCR_ARCH-NEXT:    v_readlane_b32 s0, v1, 0
+; FLAT_SCR_ARCH-NEXT:    v_readlane_b32 s1, v1, 1
+; FLAT_SCR_ARCH-NEXT:    scratch_load_dword v1, off, s3
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b32 exec_lo, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index ce24e68f60b65..0da1119117a31 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -21,12 +21,9 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0
-; GFX9-NEXT:    s_mov_b32 s2, 0
-; GFX9-NEXT:    s_mov_b32 s1, 0
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s3 offset:52
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:36
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:20
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:52
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:36
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:20
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:4
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -84,12 +81,9 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-PAL-NEXT:    s_mov_b32 s3, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3 offset:52
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:36
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:20
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:52
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:36
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:20
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:4
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -126,12 +120,9 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1010-PAL-NEXT:    s_mov_b32 s3, 0
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3 offset:52
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:36
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:20
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:52
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:36
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:20
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:4
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -892,10 +883,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    scratch_load_dword v0, off, s4 offset:4 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s1, s0
 ; GFX9-NEXT:    s_mov_b32 s2, s0
 ; GFX9-NEXT:    s_mov_b32 s3, s0
@@ -903,12 +893,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0
-; GFX9-NEXT:    s_mov_b32 s2, 0
-; GFX9-NEXT:    s_mov_b32 s1, 0
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s3 offset:260
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:276
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:292
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:260
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:276
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:292
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:308
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -958,13 +945,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
 ; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-PAL-NEXT:    s_mov_b32 s4, 0
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s4 offset:4 glc
+; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
@@ -973,12 +959,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-PAL-NEXT:    s_mov_b32 s3, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3 offset:260
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:276
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:292
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:260
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:276
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:292
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:308
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -1009,9 +992,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1010-PAL-NEXT:    s_mov_b32 s4, 0
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s4 offset:4 glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
@@ -1020,12 +1002,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1010-PAL-NEXT:    s_mov_b32 s3, 0
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3 offset:260
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:276
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:292
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:260
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:276
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:292
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:308
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -1238,8 +1217,8 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-NEXT:    s_mov_b32 s2, 0
-; GFX9-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc
+; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
@@ -1298,14 +1277,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
@@ -1348,8 +1327,8 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
 ; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1426,8 +1405,8 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    s_mov_b32 s1, 0
-; GFX9-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
 ; GFX9-NEXT:    s_addk_i32 s0, 0x104
@@ -1486,8 +1465,8 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc
+; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
@@ -1528,11 +1507,11 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
@@ -1785,13 +1764,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-NEXT:    s_add_i32 s1, s32, 0x100
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
@@ -1853,13 +1832,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-PAL-NEXT:    s_add_i32 s1, s32, 0x100
 ; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x100
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
@@ -1899,10 +1878,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    scratch_load_dword v0, off, s4 offset:4 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s1, s0
 ; GFX9-NEXT:    s_mov_b32 s2, s0
 ; GFX9-NEXT:    s_mov_b32 s3, s0
@@ -1910,13 +1888,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    s_movk_i32 s3, 0x4004
-; GFX9-NEXT:    s_movk_i32 s2, 0x4004
-; GFX9-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX9-NEXT:    s_movk_i32 s0, 0x4004
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1936,13 +1911,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    s_movk_i32 s3, 0x4004
-; GFX10-NEXT:    s_movk_i32 s2, 0x4004
-; GFX10-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX10-NEXT:    s_movk_i32 s0, 0x4004
-; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1957,14 +1929,11 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11-NEXT:    s_mov_b32 s3, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT:    s_movk_i32 s3, 0x4004
-; GFX11-NEXT:    s_movk_i32 s2, 0x4004
-; GFX11-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX11-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s2 offset:16
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1974,13 +1943,12 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
 ; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-PAL-NEXT:    s_mov_b32 s4, 0
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s4 offset:4 glc
+; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
@@ -1989,13 +1957,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-PAL-NEXT:    s_movk_i32 s3, 0x4004
-; GFX9-PAL-NEXT:    s_movk_i32 s2, 0x4004
-; GFX9-PAL-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x4004
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -2009,13 +1974,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX940-NEXT:    s_mov_b32 s3, s0
 ; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-NEXT:    s_movk_i32 s3, 0x4004
-; GFX940-NEXT:    s_movk_i32 s2, 0x4004
-; GFX940-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX940-NEXT:    s_movk_i32 s0, 0x4004
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s3 sc0 sc1
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16 sc0 sc1
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32 sc0 sc1
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -2030,9 +1992,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1010-PAL-NEXT:    s_mov_b32 s4, 0
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s4 offset:4 glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
@@ -2041,13 +2002,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1010-PAL-NEXT:    s_movk_i32 s3, 0x4004
-; GFX1010-PAL-NEXT:    s_movk_i32 s2, 0x4004
-; GFX1010-PAL-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x4004
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -2072,13 +2030,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1030-PAL-NEXT:    s_movk_i32 s3, 0x4004
-; GFX1030-PAL-NEXT:    s_movk_i32 s2, 0x4004
-; GFX1030-PAL-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x4004
-; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
@@ -2093,14 +2048,11 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-PAL-NEXT:    s_movk_i32 s3, 0x4004
-; GFX11-PAL-NEXT:    s_movk_i32 s2, 0x4004
-; GFX11-PAL-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX11-PAL-NEXT:    s_clause 0x3
-; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s3
-; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s2 offset:16
-; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s1 offset:32
+; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0
+; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:16
+; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
 ; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT:    s_endpgm
@@ -2126,13 +2078,13 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    s_add_i32 s3, s32, 0x4004
-; GFX9-NEXT:    s_add_i32 s2, s32, 0x4004
-; GFX9-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
+; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2151,13 +2103,13 @@ define void @zero_init_large_offset_foo() {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-NEXT:    s_add_i32 s3, s32, 0x4004
-; GFX10-NEXT:    s_add_i32 s2, s32, 0x4004
-; GFX10-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
+; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2175,14 +2127,13 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-NEXT:    s_mov_b32 s3, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT:    s_add_i32 s3, s32, 0x4004
-; GFX11-NEXT:    s_add_i32 s2, s32, 0x4004
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s2 offset:16
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:16
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -2200,13 +2151,13 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-PAL-NEXT:    s_add_i32 s3, s32, 0x4004
-; GFX9-PAL-NEXT:    s_add_i32 s2, s32, 0x4004
-; GFX9-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
+; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
@@ -2222,41 +2173,69 @@ define void @zero_init_large_offset_foo() {
 ; GFX940-NEXT:    s_mov_b32 s3, s0
 ; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-NEXT:    s_add_i32 s3, s32, 0x4004
-; GFX940-NEXT:    s_add_i32 s2, s32, 0x4004
-; GFX940-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s3 sc0 sc1
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16 sc0 sc1
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32 sc0 sc1
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1
+; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1
+; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1
+; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-PAL-LABEL: zero_init_large_offset_foo:
-; GFX10-PAL:       ; %bb.0:
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT:    s_add_i32 s3, s32, 0x4004
-; GFX10-PAL-NEXT:    s_add_i32 s2, s32, 0x4004
-; GFX10-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
-; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
+; GFX1010-PAL:       ; %bb.0:
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
+; GFX1030-PAL:       ; %bb.0:
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
+; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
+; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
+; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: zero_init_large_offset_foo:
 ; GFX11-PAL:       ; %bb.0:
@@ -2271,14 +2250,13 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-PAL-NEXT:    s_add_i32 s3, s32, 0x4004
-; GFX11-PAL-NEXT:    s_add_i32 s2, s32, 0x4004
-; GFX11-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX11-PAL-NEXT:    s_clause 0x3
-; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s3
-; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s2 offset:16
-; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s1 offset:32
+; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0
+; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:16
+; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
+; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
@@ -2296,8 +2274,8 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-NEXT:    s_mov_b32 s2, 0
-; GFX9-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc
+; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
@@ -2356,14 +2334,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
@@ -2406,8 +2384,8 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
 ; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2484,8 +2462,8 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT:    s_mov_b32 s1, 0
-; GFX9-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
 ; GFX9-NEXT:    s_addk_i32 s0, 0x4004
@@ -2544,8 +2522,8 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc
+; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
 ; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
@@ -2586,11 +2564,11 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX1010-PAL-NEXT:    s_mov_b32 s2, 0
-; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s2 offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
@@ -2846,13 +2824,13 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
@@ -2864,13 +2842,12 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
@@ -2899,8 +2876,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    s_add_i32 s1, s32, 0x4004
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NEXT:    v_lshl_add_u32 v1, v0, 2, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
@@ -2917,13 +2894,13 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
@@ -2935,13 +2912,12 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX11-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s1
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
@@ -2966,10 +2942,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
-; GFX9-NEXT:    s_mov_b32 s1, 0
-; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    scratch_store_dword off, v0, s1 offset:4
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-NEXT:    s_add_i32 s0, s0, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
@@ -3016,14 +2992,14 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
-; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
+; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
-; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1 offset:4
+; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:4
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
@@ -3919,10 +3895,9 @@ define amdgpu_ps void @large_offset() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-NEXT:    s_mov_b32 s1, 0
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:3024
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:3024
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 16
@@ -3994,10 +3969,9 @@ define amdgpu_ps void @large_offset() {
 ; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:3024
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:3024
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 16

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
index eebd326d8e421..31bc2b04dde8c 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -89,8 +89,8 @@ body:             |
     ; GCN-LABEL: name: func_add_constant_to_fi_uniform_SCC_clobber_i32
     ; GCN: liveins: $sgpr30_sgpr31
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $sgpr1 = S_LSHR_B32 6, $sgpr32, implicit-def dead $scc
-    ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 killed $sgpr1, 4, implicit-def $scc
+    ; GCN-NEXT: $sgpr0 = S_LSHR_B32 6, $sgpr32, implicit-def dead $scc
+    ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 killed $sgpr0, 4, implicit-def $scc
     ; GCN-NEXT: renamable $sgpr5 = S_ADDC_U32 $sgpr4, 1234567, implicit-def $scc, implicit $scc
     ; GCN-NEXT: $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
     ; GCN-NEXT: $sgpr0 = S_ADD_I32 killed $sgpr0, 8, implicit-def $scc
@@ -167,8 +167,8 @@ body:             |
     ; GCN: liveins: $sgpr10, $sgpr30_sgpr31
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 $sgpr10, 4, implicit-def $scc
-    ; GCN-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
-    ; GCN-NEXT: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; GCN-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GCN-NEXT: $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
     ; GCN-NEXT: renamable $sgpr5 = S_MUL_I32 killed $sgpr0, $sgpr4
     ; GCN-NEXT: renamable $sgpr6 = S_ADDC_U32 $sgpr5, 1234567, implicit-def dead $scc, implicit $scc
     ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr6, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index e7cab2606aa8a..cd69ffed1cec3 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -47,22 +47,22 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, s4
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0x800
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[10:11], exec
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v2, s30, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:16
+; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
+; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s30, 0
+; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[10:11]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[8:9], exec
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v1, s31, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:16
+; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
+; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 0
+; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[8:9]
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
@@ -70,24 +70,24 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; NO-SPILL-TO-VGPR-NEXT:    s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
 ; NO-SPILL-TO-VGPR-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[8:9], exec
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
+; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v2, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:16
+; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s31, v0, 0
+; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[8:9]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[6:7], exec
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
-; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:16
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
+; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v1, 0
-; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:16
+; NO-SPILL-TO-VGPR-NEXT:    v_readlane_b32 s30, v0, 0
+; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[6:7]
+; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; NO-SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0xf800
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index 72a86c24235f4..52f97150e4b30 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -61,10 +61,10 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_movk_i32 s3, 0x2000
-; FLATSCR-NEXT:    s_mov_b32 s2, 0
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s3
+; FLATSCR-NEXT:    s_movk_i32 s2, 0x2000
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s2
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s2, 0
 ; FLATSCR-NEXT:  .LBB0_1: ; %loadstoreloop
 ; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; FLATSCR-NEXT:    s_add_i32 s3, s2, 0x3000
@@ -155,10 +155,10 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
 ; FLATSCR-NEXT:    s_and_b32 s33, s33, 0xffffe000
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 0x8000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT:    s_add_i32 s1, s33, 0x2000
-; FLATSCR-NEXT:    s_mov_b32 s0, 0
-; FLATSCR-NEXT:    scratch_store_dword off, v2, s1
+; FLATSCR-NEXT:    s_add_i32 s0, s33, 0x2000
+; FLATSCR-NEXT:    scratch_store_dword off, v2, s0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, 0
 ; FLATSCR-NEXT:  .LBB1_1: ; %loadstoreloop
 ; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; FLATSCR-NEXT:    s_add_i32 s3, s33, 0x3000
@@ -233,35 +233,35 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_or_b32_e32 v1, 0x12cc, v0
 ; MUBUF-NEXT:    v_or_b32_e32 v0, 0x12c8, v0
-; MUBUF-NEXT:    v_mov_b32_e32 v18, 0x4000
-; MUBUF-NEXT:    v_mov_b32_e32 v17, 0x4000
-; MUBUF-NEXT:    v_mov_b32_e32 v16, 0x4000
+; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; MUBUF-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_mov_b32_e32 v15, 0x4000
+; MUBUF-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_mov_b32_e32 v14, 0x4000
-; MUBUF-NEXT:    buffer_load_dword v8, v18, s[0:3], 0 offen glc
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    buffer_load_dword v9, v17, s[0:3], 0 offen offset:4 glc
+; MUBUF-NEXT:    v_mov_b32_e32 v10, 0x4000
+; MUBUF-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    buffer_load_dword v2, v16, s[0:3], 0 offen offset:8 glc
+; MUBUF-NEXT:    v_mov_b32_e32 v2, 0x4000
+; MUBUF-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:4 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    buffer_load_dword v3, v15, s[0:3], 0 offen offset:12 glc
+; MUBUF-NEXT:    v_mov_b32_e32 v11, 0x4000
+; MUBUF-NEXT:    buffer_load_dword v2, v3, s[0:3], 0 offen offset:8 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    buffer_load_dword v10, v14, s[0:3], 0 offen offset:16 glc
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_mov_b32_e32 v13, 0x4000
-; MUBUF-NEXT:    buffer_load_dword v11, v13, s[0:3], 0 offen offset:20 glc
+; MUBUF-NEXT:    v_mov_b32_e32 v12, 0x4000
+; MUBUF-NEXT:    buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; MUBUF-NEXT:    v_mov_b32_e32 v12, 0
+; MUBUF-NEXT:    buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
+; MUBUF-NEXT:    buffer_load_dword v11, v12, s[0:3], 0 offen offset:20 glc
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; MUBUF-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v8
 ; MUBUF-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v9, vcc
 ; MUBUF-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; MUBUF-NEXT:    v_mov_b32_e32 v12, 0
 ; MUBUF-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v11, vcc
 ; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; MUBUF-NEXT:    global_store_dwordx2 v12, v[4:5], s[4:5] offset:16
@@ -275,9 +275,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_mov_b32 s3, 0
 ; FLATSCR-NEXT:    s_mov_b32 s2, 0
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s3 offset:1024
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s2 offset:1024
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:  .LBB2_1: ; %loadstoreloop
 ; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -294,9 +293,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 offset:704 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    s_movk_i32 s3, 0x2000
 ; FLATSCR-NEXT:    s_movk_i32 s2, 0x2000
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s3 offset:16 glc
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_dwordx4 v[4:7], off, s2 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
index 4a69e7ac04a09..808e61f517a11 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
@@ -87,10 +87,10 @@ body:             |
     ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7
     ; GFX9-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr4, 0, undef $vgpr2
     ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc
-    ; GFX9-FLATSCR-NEXT: $sgpr9 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc
-    ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr9, implicit $exec
-    ; GFX9-FLATSCR-NEXT: $sgpr8 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc
-    ; GFX9-FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr8, $vgpr1, implicit $exec
+    ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc
+    ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec
+    ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc
+    ; GFX9-FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr4, $vgpr1, implicit $exec
     ; GFX9-FLATSCR-NEXT: $sgpr4 = V_READLANE_B32 $vgpr2, 0
     ; GFX9-FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     ; GFX9-FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc

diff  --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 78880428278cd..a5bd362d3b2e5 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -34,7 +34,7 @@
 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
-; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s1, 0
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
@@ -102,7 +102,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
-; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s1, 0
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
@@ -155,7 +155,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x10
 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
-; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s1, 0
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
@@ -237,7 +237,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
-; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s1, 0
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
@@ -293,7 +293,7 @@ define amdgpu_gs float @gs_main(i32 %idx) {
 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
-; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s1, 0
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
@@ -349,7 +349,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
-; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s1, 0
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir
index 8f7d2f7d05927..797231b3412a8 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir
@@ -77,7 +77,7 @@ body:             |
   ; VMEM-GFX8-NEXT:   $sgpr4_sgpr5 = S_MOV_B64 $exec
   ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 3, implicit-def $vgpr0
   ; VMEM-GFX8-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
-  ; VMEM-GFX8-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 0, undef $vgpr0
+  ; VMEM-GFX8-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 0, undef $vgpr0, implicit $sgpr8_sgpr9
   ; VMEM-GFX8-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr9, 1, $vgpr0, implicit $sgpr8_sgpr9
   ; VMEM-GFX8-NEXT:   $vgpr1 = V_MOV_B32_e32 8200, implicit $exec
   ; VMEM-GFX8-NEXT:   BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
@@ -333,7 +333,7 @@ body:             |
   ; VMEM-GFX8-NEXT:   $sgpr4_sgpr5 = S_MOV_B64 $exec
   ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 3
   ; VMEM-GFX8-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
-  ; VMEM-GFX8-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 0, undef $vgpr0
+  ; VMEM-GFX8-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 0, undef $vgpr0, implicit $sgpr8_sgpr9
   ; VMEM-GFX8-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr9, 1, $vgpr0, implicit $sgpr8_sgpr9
   ; VMEM-GFX8-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
   ; VMEM-GFX8-NEXT:   $vgpr1 = V_MOV_B32_e32 8200, implicit $exec
@@ -444,7 +444,7 @@ body:             |
   ; VMEM-GFX8-NEXT:   liveins: $sgpr8, $sgpr9, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
   ; VMEM-GFX8-NEXT: {{  $}}
   ; VMEM-GFX8-NEXT:   S_CMP_EQ_U32 0, 0, implicit-def $scc
-  ; VMEM-GFX8-NEXT:   $sgpr6_sgpr7 = S_MOV_B64 $exec
+  ; VMEM-GFX8-NEXT:   $sgpr4_sgpr5 = S_MOV_B64 $exec
   ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 1
   ; VMEM-GFX8-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
   ; VMEM-GFX8-NEXT:   $vgpr0 = V_WRITELANE_B32 $sgpr8, 0, undef $vgpr0
@@ -452,7 +452,7 @@ body:             |
   ; VMEM-GFX8-NEXT:   $vgpr1 = V_MOV_B32_e32 8200, implicit $exec
   ; VMEM-GFX8-NEXT:   BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
   ; VMEM-GFX8-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
-  ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 killed $sgpr6_sgpr7
+  ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 killed $sgpr4_sgpr5
   ; VMEM-GFX8-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
   ; VMEM-GFX8-NEXT:   $sgpr4_sgpr5 = S_MOV_B64 $exec
   ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 1
@@ -508,7 +508,7 @@ body:             |
   ; VMEM-GFX8-NEXT:   liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
   ; VMEM-GFX8-NEXT: {{  $}}
   ; VMEM-GFX8-NEXT:   S_CMP_EQ_U32 0, 0, implicit-def $scc
-  ; VMEM-GFX8-NEXT:   $sgpr6_sgpr7 = S_MOV_B64 $exec
+  ; VMEM-GFX8-NEXT:   $sgpr4_sgpr5 = S_MOV_B64 $exec
   ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 1
   ; VMEM-GFX8-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
   ; VMEM-GFX8-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
@@ -516,7 +516,7 @@ body:             |
   ; VMEM-GFX8-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
   ; VMEM-GFX8-NEXT:   $sgpr8 = V_READLANE_B32 killed $vgpr0, 0
   ; VMEM-GFX8-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
-  ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 killed $sgpr6_sgpr7
+  ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 killed $sgpr4_sgpr5
   ; VMEM-GFX8-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
   ; VMEM-GFX8-NEXT:   $sgpr4_sgpr5 = S_MOV_B64 $exec
   ; VMEM-GFX8-NEXT:   $exec = S_MOV_B64 1

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir
index aadd9e79ff617..bbbe87848ae4e 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir
@@ -26,13 +26,13 @@ body:             |
     ; CHECK: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
-    ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec
-    ; CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr2
-    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
-    ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr2
-    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
-    ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
-    ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr2
+    ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
+    ; CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1
+    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr1
+    ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
     ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
     ; CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1
     ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
index 612af095d6d83..a3fabfffde088 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -66,13 +66,13 @@ body:             |
     ; GCN64-MUBUF-NEXT: $sgpr28 = S_ADD_U32 $sgpr28, $sgpr11, implicit-def $scc, implicit-def $sgpr28_sgpr29_sgpr30_sgpr31
     ; GCN64-MUBUF-NEXT: $sgpr29 = S_ADDC_U32 $sgpr29, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr28_sgpr29_sgpr30_sgpr31
     ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
-    ; GCN64-MUBUF-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 killed $sgpr12, 0, undef $vgpr1
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr1
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr12, 0, undef $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
     ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
@@ -82,33 +82,33 @@ body:             |
     ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
-    ; GCN64-MUBUF-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr2
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr2, implicit $sgpr12_sgpr13
-    ; GCN64-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr13, 1, $vgpr2, implicit killed $sgpr12_sgpr13
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr2
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
-    ; GCN64-MUBUF-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr3
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr3 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr3, implicit $sgpr12_sgpr13
-    ; GCN64-MUBUF-NEXT: $vgpr3 = V_WRITELANE_B32 $sgpr13, 1, $vgpr3, implicit $sgpr12_sgpr13
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr3
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
-    ; GCN64-MUBUF-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr1
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr1, implicit $sgpr12_sgpr13_sgpr14
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr13, 1, $vgpr1, implicit $sgpr12_sgpr13_sgpr14
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr14, 2, $vgpr1, implicit killed $sgpr12_sgpr13_sgpr14
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr1
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
     ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
@@ -121,32 +121,32 @@ body:             |
     ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
-    ; GCN64-MUBUF-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr2
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr13, 1, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr14, 2, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr15, 3, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr16, 4, $vgpr2, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr2
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
-    ; GCN64-MUBUF-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr1
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr13, 1, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr14, 2, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr15, 3, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr16, 4, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr17, 5, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr18, 6, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr19, 7, $vgpr1, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr1
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr16, 4, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr17, 5, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
     ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
@@ -171,44 +171,44 @@ body:             |
     ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
-    ; GCN64-MUBUF-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr1
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr64, 0, undef $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr65, 1, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr66, 2, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr67, 3, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr68, 4, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr69, 5, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr70, 6, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr71, 7, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr72, 8, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr73, 9, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr74, 10, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr75, 11, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr76, 12, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr77, 13, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr78, 14, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr79, 15, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr80, 16, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr81, 17, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr82, 18, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr83, 19, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr84, 20, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr85, 21, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr86, 22, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr87, 23, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr88, 24, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr89, 25, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr90, 26, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr91, 27, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr92, 28, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr93, 29, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr94, 30, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr95, 31, $vgpr1, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr67, 3, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr68, 4, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr69, 5, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr70, 6, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr71, 7, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr72, 8, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr73, 9, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr74, 10, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr75, 11, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr76, 12, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr77, 13, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr78, 14, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr79, 15, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr80, 16, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr81, 17, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr82, 18, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr83, 19, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr84, 20, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr85, 21, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr86, 22, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr87, 23, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr88, 24, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr89, 25, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr90, 26, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr91, 27, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr92, 28, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr93, 29, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
     ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
@@ -229,13 +229,13 @@ body:             |
     ; GCN32-MUBUF-NEXT: $sgpr96 = S_ADD_U32 $sgpr96, $sgpr11, implicit-def $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
     ; GCN32-MUBUF-NEXT: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
     ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
-    ; GCN32-MUBUF-NEXT: $sgpr3 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr1
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 killed $sgpr12, 0, undef $vgpr1
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr3, implicit killed $vgpr1
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr12, 0, undef $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
     ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
@@ -245,33 +245,33 @@ body:             |
     ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
-    ; GCN32-MUBUF-NEXT: $sgpr2 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr2
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr2, implicit $sgpr12_sgpr13
-    ; GCN32-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr13, 1, $vgpr2, implicit killed $sgpr12_sgpr13
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr2, implicit killed $vgpr2
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
-    ; GCN32-MUBUF-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr3
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr3 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr3, implicit $sgpr12_sgpr13
-    ; GCN32-MUBUF-NEXT: $vgpr3 = V_WRITELANE_B32 $sgpr13, 1, $vgpr3, implicit $sgpr12_sgpr13
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1, implicit killed $vgpr3
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
-    ; GCN32-MUBUF-NEXT: $sgpr3 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr1
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr1, implicit $sgpr12_sgpr13_sgpr14
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr13, 1, $vgpr1, implicit $sgpr12_sgpr13_sgpr14
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr14, 2, $vgpr1, implicit killed $sgpr12_sgpr13_sgpr14
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr3, implicit killed $vgpr1
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
     ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 15, implicit-def $vgpr0
@@ -284,32 +284,32 @@ body:             |
     ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
-    ; GCN32-MUBUF-NEXT: $sgpr2 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr2
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN32-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr13, 1, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN32-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr14, 2, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN32-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr15, 3, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN32-MUBUF-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr16, 4, $vgpr2, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr2, implicit killed $vgpr2
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
-    ; GCN32-MUBUF-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr1
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr13, 1, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr14, 2, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr15, 3, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr16, 4, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr17, 5, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr18, 6, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr19, 7, $vgpr1, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1, implicit killed $vgpr1
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr16, 4, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr17, 5, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
     ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 65535, implicit-def $vgpr0
@@ -334,44 +334,44 @@ body:             |
     ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
-    ; GCN32-MUBUF-NEXT: $sgpr2 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr1
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr64, 0, undef $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr65, 1, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr66, 2, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr67, 3, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr68, 4, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr69, 5, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr70, 6, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr71, 7, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr72, 8, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr73, 9, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr74, 10, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr75, 11, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr76, 12, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr77, 13, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr78, 14, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr79, 15, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr80, 16, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr81, 17, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr82, 18, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr83, 19, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr84, 20, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr85, 21, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr86, 22, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr87, 23, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr88, 24, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr89, 25, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr90, 26, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr91, 27, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr92, 28, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr93, 29, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr94, 30, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr95, 31, $vgpr1, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr2, implicit killed $vgpr1
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr67, 3, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr68, 4, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr69, 5, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr70, 6, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr71, 7, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr72, 8, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr73, 9, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr74, 10, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr75, 11, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr76, 12, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr77, 13, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr78, 14, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr79, 15, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr80, 16, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr81, 17, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr82, 18, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr83, 19, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr84, 20, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr85, 21, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr86, 22, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr87, 23, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr88, 24, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr89, 25, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr90, 26, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr91, 27, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr92, 28, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr93, 29, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: renamable $sgpr12 = IMPLICIT_DEF
     ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
@@ -388,13 +388,13 @@ body:             |
     ; GCN64-FLATSCR-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc
     ; GCN64-FLATSCR-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
-    ; GCN64-FLATSCR-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 killed $sgpr12, 0, undef $vgpr1
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr1
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr12, 0, undef $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
     ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
@@ -404,33 +404,33 @@ body:             |
     ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
-    ; GCN64-FLATSCR-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr2
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr2, implicit $sgpr12_sgpr13
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr13, 1, $vgpr2, implicit killed $sgpr12_sgpr13
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr2
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit killed $sgpr12_sgpr13
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13 = IMPLICIT_DEF
-    ; GCN64-FLATSCR-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr3
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr3 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr3, implicit $sgpr12_sgpr13
-    ; GCN64-FLATSCR-NEXT: $vgpr3 = V_WRITELANE_B32 $sgpr13, 1, $vgpr3, implicit $sgpr12_sgpr13
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr3
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF
-    ; GCN64-FLATSCR-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr1
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr1, implicit $sgpr12_sgpr13_sgpr14
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr13, 1, $vgpr1, implicit $sgpr12_sgpr13_sgpr14
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr14, 2, $vgpr1, implicit killed $sgpr12_sgpr13_sgpr14
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr1
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF
     ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
@@ -443,32 +443,32 @@ body:             |
     ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF
-    ; GCN64-FLATSCR-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr2
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr13, 1, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr14, 2, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr15, 3, $vgpr2, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr16, 4, $vgpr2, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr2
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr16, 4, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
-    ; GCN64-FLATSCR-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr1
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr13, 1, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr14, 2, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr15, 3, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr16, 4, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr17, 5, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr18, 6, $vgpr1, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr19, 7, $vgpr1, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 64, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr1
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 0, undef $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 1, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 2, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr15, 3, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr16, 4, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr17, 5, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr18, 6, $vgpr0, implicit $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr19, 7, $vgpr0, implicit killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 64, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
     ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
@@ -493,44 +493,44 @@ body:             |
     ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF
-    ; GCN64-FLATSCR-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr1
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr64, 0, undef $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr65, 1, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr66, 2, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr67, 3, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr68, 4, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr69, 5, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr70, 6, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr71, 7, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr72, 8, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr73, 9, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr74, 10, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr75, 11, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr76, 12, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr77, 13, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr78, 14, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr79, 15, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr80, 16, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr81, 17, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr82, 18, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr83, 19, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr84, 20, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr85, 21, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr86, 22, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr87, 23, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr88, 24, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr89, 25, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr90, 26, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr91, 27, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr92, 28, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr93, 29, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr94, 30, $vgpr1, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr95, 31, $vgpr1, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr64, 0, undef $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr65, 1, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr66, 2, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr67, 3, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr68, 4, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr69, 5, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr70, 6, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr71, 7, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr72, 8, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr73, 9, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr74, 10, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr75, 11, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr76, 12, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr77, 13, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr78, 14, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr79, 15, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr80, 16, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr81, 17, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr82, 18, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr83, 19, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr84, 20, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr85, 21, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr86, 22, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr87, 23, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr88, 24, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr89, 25, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr90, 26, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr91, 27, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr92, 28, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr93, 29, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr94, 30, $vgpr0, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr95, 31, $vgpr0, implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: renamable $sgpr12 = IMPLICIT_DEF
     ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
@@ -622,30 +622,30 @@ body:             |
     ; GCN64-MUBUF-NEXT: $sgpr31 = S_MOV_B32 14680064, implicit-def $sgpr28_sgpr29_sgpr30_sgpr31
     ; GCN64-MUBUF-NEXT: $sgpr28 = S_ADD_U32 $sgpr28, $sgpr11, implicit-def $scc, implicit-def $sgpr28_sgpr29_sgpr30_sgpr31
     ; GCN64-MUBUF-NEXT: $sgpr29 = S_ADDC_U32 $sgpr29, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr28_sgpr29_sgpr30_sgpr31
-    ; GCN64-MUBUF-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 killed $vgpr1, 0
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
-    ; GCN64-MUBUF-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr2
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr2, 0, implicit-def $sgpr12_sgpr13
-    ; GCN64-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 killed $vgpr2, 1
-    ; GCN64-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr2
-    ; GCN64-MUBUF-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr3
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr3, 0, implicit-def $sgpr12_sgpr13_sgpr14
-    ; GCN64-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr3, 1
-    ; GCN64-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 killed $vgpr3, 2
-    ; GCN64-MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr3
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 killed $vgpr0, 0
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13
+    ; GCN64-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 killed $vgpr0, 1
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14
+    ; GCN64-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN64-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 killed $vgpr0, 2
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
     ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -656,31 +656,31 @@ body:             |
     ; GCN64-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 killed $vgpr0, 3
     ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
-    ; GCN64-MUBUF-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr1
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr1, 1
-    ; GCN64-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 $vgpr1, 2
-    ; GCN64-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 $vgpr1, 3
-    ; GCN64-MUBUF-NEXT: $sgpr16 = V_READLANE_B32 killed $vgpr1, 4
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
-    ; GCN64-MUBUF-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr2
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr2, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr2, 1
-    ; GCN64-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 $vgpr2, 2
-    ; GCN64-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 $vgpr2, 3
-    ; GCN64-MUBUF-NEXT: $sgpr16 = V_READLANE_B32 $vgpr2, 4
-    ; GCN64-MUBUF-NEXT: $sgpr17 = V_READLANE_B32 $vgpr2, 5
-    ; GCN64-MUBUF-NEXT: $sgpr18 = V_READLANE_B32 $vgpr2, 6
-    ; GCN64-MUBUF-NEXT: $sgpr19 = V_READLANE_B32 killed $vgpr2, 7
-    ; GCN64-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr2
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN64-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 2
+    ; GCN64-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 3
+    ; GCN64-MUBUF-NEXT: $sgpr16 = V_READLANE_B32 killed $vgpr0, 4
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN64-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 2
+    ; GCN64-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 3
+    ; GCN64-MUBUF-NEXT: $sgpr16 = V_READLANE_B32 $vgpr0, 4
+    ; GCN64-MUBUF-NEXT: $sgpr17 = V_READLANE_B32 $vgpr0, 5
+    ; GCN64-MUBUF-NEXT: $sgpr18 = V_READLANE_B32 $vgpr0, 6
+    ; GCN64-MUBUF-NEXT: $sgpr19 = V_READLANE_B32 killed $vgpr0, 7
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
     ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -703,44 +703,44 @@ body:             |
     ; GCN64-MUBUF-NEXT: $sgpr27 = V_READLANE_B32 killed $vgpr0, 15
     ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
-    ; GCN64-MUBUF-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr1
-    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $sgpr64 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-MUBUF-NEXT: $sgpr65 = V_READLANE_B32 $vgpr1, 1
-    ; GCN64-MUBUF-NEXT: $sgpr66 = V_READLANE_B32 $vgpr1, 2
-    ; GCN64-MUBUF-NEXT: $sgpr67 = V_READLANE_B32 $vgpr1, 3
-    ; GCN64-MUBUF-NEXT: $sgpr68 = V_READLANE_B32 $vgpr1, 4
-    ; GCN64-MUBUF-NEXT: $sgpr69 = V_READLANE_B32 $vgpr1, 5
-    ; GCN64-MUBUF-NEXT: $sgpr70 = V_READLANE_B32 $vgpr1, 6
-    ; GCN64-MUBUF-NEXT: $sgpr71 = V_READLANE_B32 $vgpr1, 7
-    ; GCN64-MUBUF-NEXT: $sgpr72 = V_READLANE_B32 $vgpr1, 8
-    ; GCN64-MUBUF-NEXT: $sgpr73 = V_READLANE_B32 $vgpr1, 9
-    ; GCN64-MUBUF-NEXT: $sgpr74 = V_READLANE_B32 $vgpr1, 10
-    ; GCN64-MUBUF-NEXT: $sgpr75 = V_READLANE_B32 $vgpr1, 11
-    ; GCN64-MUBUF-NEXT: $sgpr76 = V_READLANE_B32 $vgpr1, 12
-    ; GCN64-MUBUF-NEXT: $sgpr77 = V_READLANE_B32 $vgpr1, 13
-    ; GCN64-MUBUF-NEXT: $sgpr78 = V_READLANE_B32 $vgpr1, 14
-    ; GCN64-MUBUF-NEXT: $sgpr79 = V_READLANE_B32 $vgpr1, 15
-    ; GCN64-MUBUF-NEXT: $sgpr80 = V_READLANE_B32 $vgpr1, 16
-    ; GCN64-MUBUF-NEXT: $sgpr81 = V_READLANE_B32 $vgpr1, 17
-    ; GCN64-MUBUF-NEXT: $sgpr82 = V_READLANE_B32 $vgpr1, 18
-    ; GCN64-MUBUF-NEXT: $sgpr83 = V_READLANE_B32 $vgpr1, 19
-    ; GCN64-MUBUF-NEXT: $sgpr84 = V_READLANE_B32 $vgpr1, 20
-    ; GCN64-MUBUF-NEXT: $sgpr85 = V_READLANE_B32 $vgpr1, 21
-    ; GCN64-MUBUF-NEXT: $sgpr86 = V_READLANE_B32 $vgpr1, 22
-    ; GCN64-MUBUF-NEXT: $sgpr87 = V_READLANE_B32 $vgpr1, 23
-    ; GCN64-MUBUF-NEXT: $sgpr88 = V_READLANE_B32 $vgpr1, 24
-    ; GCN64-MUBUF-NEXT: $sgpr89 = V_READLANE_B32 $vgpr1, 25
-    ; GCN64-MUBUF-NEXT: $sgpr90 = V_READLANE_B32 $vgpr1, 26
-    ; GCN64-MUBUF-NEXT: $sgpr91 = V_READLANE_B32 $vgpr1, 27
-    ; GCN64-MUBUF-NEXT: $sgpr92 = V_READLANE_B32 $vgpr1, 28
-    ; GCN64-MUBUF-NEXT: $sgpr93 = V_READLANE_B32 $vgpr1, 29
-    ; GCN64-MUBUF-NEXT: $sgpr94 = V_READLANE_B32 $vgpr1, 30
-    ; GCN64-MUBUF-NEXT: $sgpr95 = V_READLANE_B32 killed $vgpr1, 31
-    ; GCN64-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
+    ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
+    ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $sgpr64 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-MUBUF-NEXT: $sgpr65 = V_READLANE_B32 $vgpr0, 1
+    ; GCN64-MUBUF-NEXT: $sgpr66 = V_READLANE_B32 $vgpr0, 2
+    ; GCN64-MUBUF-NEXT: $sgpr67 = V_READLANE_B32 $vgpr0, 3
+    ; GCN64-MUBUF-NEXT: $sgpr68 = V_READLANE_B32 $vgpr0, 4
+    ; GCN64-MUBUF-NEXT: $sgpr69 = V_READLANE_B32 $vgpr0, 5
+    ; GCN64-MUBUF-NEXT: $sgpr70 = V_READLANE_B32 $vgpr0, 6
+    ; GCN64-MUBUF-NEXT: $sgpr71 = V_READLANE_B32 $vgpr0, 7
+    ; GCN64-MUBUF-NEXT: $sgpr72 = V_READLANE_B32 $vgpr0, 8
+    ; GCN64-MUBUF-NEXT: $sgpr73 = V_READLANE_B32 $vgpr0, 9
+    ; GCN64-MUBUF-NEXT: $sgpr74 = V_READLANE_B32 $vgpr0, 10
+    ; GCN64-MUBUF-NEXT: $sgpr75 = V_READLANE_B32 $vgpr0, 11
+    ; GCN64-MUBUF-NEXT: $sgpr76 = V_READLANE_B32 $vgpr0, 12
+    ; GCN64-MUBUF-NEXT: $sgpr77 = V_READLANE_B32 $vgpr0, 13
+    ; GCN64-MUBUF-NEXT: $sgpr78 = V_READLANE_B32 $vgpr0, 14
+    ; GCN64-MUBUF-NEXT: $sgpr79 = V_READLANE_B32 $vgpr0, 15
+    ; GCN64-MUBUF-NEXT: $sgpr80 = V_READLANE_B32 $vgpr0, 16
+    ; GCN64-MUBUF-NEXT: $sgpr81 = V_READLANE_B32 $vgpr0, 17
+    ; GCN64-MUBUF-NEXT: $sgpr82 = V_READLANE_B32 $vgpr0, 18
+    ; GCN64-MUBUF-NEXT: $sgpr83 = V_READLANE_B32 $vgpr0, 19
+    ; GCN64-MUBUF-NEXT: $sgpr84 = V_READLANE_B32 $vgpr0, 20
+    ; GCN64-MUBUF-NEXT: $sgpr85 = V_READLANE_B32 $vgpr0, 21
+    ; GCN64-MUBUF-NEXT: $sgpr86 = V_READLANE_B32 $vgpr0, 22
+    ; GCN64-MUBUF-NEXT: $sgpr87 = V_READLANE_B32 $vgpr0, 23
+    ; GCN64-MUBUF-NEXT: $sgpr88 = V_READLANE_B32 $vgpr0, 24
+    ; GCN64-MUBUF-NEXT: $sgpr89 = V_READLANE_B32 $vgpr0, 25
+    ; GCN64-MUBUF-NEXT: $sgpr90 = V_READLANE_B32 $vgpr0, 26
+    ; GCN64-MUBUF-NEXT: $sgpr91 = V_READLANE_B32 $vgpr0, 27
+    ; GCN64-MUBUF-NEXT: $sgpr92 = V_READLANE_B32 $vgpr0, 28
+    ; GCN64-MUBUF-NEXT: $sgpr93 = V_READLANE_B32 $vgpr0, 29
+    ; GCN64-MUBUF-NEXT: $sgpr94 = V_READLANE_B32 $vgpr0, 30
+    ; GCN64-MUBUF-NEXT: $sgpr95 = V_READLANE_B32 killed $vgpr0, 31
+    ; GCN64-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-MUBUF-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-MUBUF-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
     ; GCN64-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr28_sgpr29_sgpr30_sgpr31, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -759,30 +759,30 @@ body:             |
     ; GCN32-MUBUF-NEXT: $sgpr99 = S_MOV_B32 834756608, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
     ; GCN32-MUBUF-NEXT: $sgpr96 = S_ADD_U32 $sgpr96, $sgpr11, implicit-def $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
     ; GCN32-MUBUF-NEXT: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
-    ; GCN32-MUBUF-NEXT: $sgpr2 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr1
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 killed $vgpr1, 0
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr2, implicit killed $vgpr1
-    ; GCN32-MUBUF-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr2
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr2, 0, implicit-def $sgpr12_sgpr13
-    ; GCN32-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 killed $vgpr2, 1
-    ; GCN32-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1, implicit killed $vgpr2
-    ; GCN32-MUBUF-NEXT: $sgpr3 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr3
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr3, 0, implicit-def $sgpr12_sgpr13_sgpr14
-    ; GCN32-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr3, 1
-    ; GCN32-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 killed $vgpr3, 2
-    ; GCN32-MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr3, implicit killed $vgpr3
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 killed $vgpr0, 0
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 3, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13
+    ; GCN32-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 killed $vgpr0, 1
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 7, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 16, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14
+    ; GCN32-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN32-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 killed $vgpr0, 2
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 15, implicit-def $vgpr0
     ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -793,31 +793,31 @@ body:             |
     ; GCN32-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 killed $vgpr0, 3
     ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
-    ; GCN32-MUBUF-NEXT: $sgpr2 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr1
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN32-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr1, 1
-    ; GCN32-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 $vgpr1, 2
-    ; GCN32-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 $vgpr1, 3
-    ; GCN32-MUBUF-NEXT: $sgpr16 = V_READLANE_B32 killed $vgpr1, 4
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr2, implicit killed $vgpr1
-    ; GCN32-MUBUF-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr2
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr2, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN32-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr2, 1
-    ; GCN32-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 $vgpr2, 2
-    ; GCN32-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 $vgpr2, 3
-    ; GCN32-MUBUF-NEXT: $sgpr16 = V_READLANE_B32 $vgpr2, 4
-    ; GCN32-MUBUF-NEXT: $sgpr17 = V_READLANE_B32 $vgpr2, 5
-    ; GCN32-MUBUF-NEXT: $sgpr18 = V_READLANE_B32 $vgpr2, 6
-    ; GCN32-MUBUF-NEXT: $sgpr19 = V_READLANE_B32 killed $vgpr2, 7
-    ; GCN32-MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1, implicit killed $vgpr2
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 31, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 44, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN32-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN32-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 2
+    ; GCN32-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 3
+    ; GCN32-MUBUF-NEXT: $sgpr16 = V_READLANE_B32 killed $vgpr0, 4
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 255, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 64, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN32-MUBUF-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN32-MUBUF-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 2
+    ; GCN32-MUBUF-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 3
+    ; GCN32-MUBUF-NEXT: $sgpr16 = V_READLANE_B32 $vgpr0, 4
+    ; GCN32-MUBUF-NEXT: $sgpr17 = V_READLANE_B32 $vgpr0, 5
+    ; GCN32-MUBUF-NEXT: $sgpr18 = V_READLANE_B32 $vgpr0, 6
+    ; GCN32-MUBUF-NEXT: $sgpr19 = V_READLANE_B32 killed $vgpr0, 7
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 65535, implicit-def $vgpr0
     ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -840,44 +840,44 @@ body:             |
     ; GCN32-MUBUF-NEXT: $sgpr27 = V_READLANE_B32 killed $vgpr0, 15
     ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
-    ; GCN32-MUBUF-NEXT: $sgpr2 = S_MOV_B32 $exec_lo
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr1
-    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $sgpr64 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN32-MUBUF-NEXT: $sgpr65 = V_READLANE_B32 $vgpr1, 1
-    ; GCN32-MUBUF-NEXT: $sgpr66 = V_READLANE_B32 $vgpr1, 2
-    ; GCN32-MUBUF-NEXT: $sgpr67 = V_READLANE_B32 $vgpr1, 3
-    ; GCN32-MUBUF-NEXT: $sgpr68 = V_READLANE_B32 $vgpr1, 4
-    ; GCN32-MUBUF-NEXT: $sgpr69 = V_READLANE_B32 $vgpr1, 5
-    ; GCN32-MUBUF-NEXT: $sgpr70 = V_READLANE_B32 $vgpr1, 6
-    ; GCN32-MUBUF-NEXT: $sgpr71 = V_READLANE_B32 $vgpr1, 7
-    ; GCN32-MUBUF-NEXT: $sgpr72 = V_READLANE_B32 $vgpr1, 8
-    ; GCN32-MUBUF-NEXT: $sgpr73 = V_READLANE_B32 $vgpr1, 9
-    ; GCN32-MUBUF-NEXT: $sgpr74 = V_READLANE_B32 $vgpr1, 10
-    ; GCN32-MUBUF-NEXT: $sgpr75 = V_READLANE_B32 $vgpr1, 11
-    ; GCN32-MUBUF-NEXT: $sgpr76 = V_READLANE_B32 $vgpr1, 12
-    ; GCN32-MUBUF-NEXT: $sgpr77 = V_READLANE_B32 $vgpr1, 13
-    ; GCN32-MUBUF-NEXT: $sgpr78 = V_READLANE_B32 $vgpr1, 14
-    ; GCN32-MUBUF-NEXT: $sgpr79 = V_READLANE_B32 $vgpr1, 15
-    ; GCN32-MUBUF-NEXT: $sgpr80 = V_READLANE_B32 $vgpr1, 16
-    ; GCN32-MUBUF-NEXT: $sgpr81 = V_READLANE_B32 $vgpr1, 17
-    ; GCN32-MUBUF-NEXT: $sgpr82 = V_READLANE_B32 $vgpr1, 18
-    ; GCN32-MUBUF-NEXT: $sgpr83 = V_READLANE_B32 $vgpr1, 19
-    ; GCN32-MUBUF-NEXT: $sgpr84 = V_READLANE_B32 $vgpr1, 20
-    ; GCN32-MUBUF-NEXT: $sgpr85 = V_READLANE_B32 $vgpr1, 21
-    ; GCN32-MUBUF-NEXT: $sgpr86 = V_READLANE_B32 $vgpr1, 22
-    ; GCN32-MUBUF-NEXT: $sgpr87 = V_READLANE_B32 $vgpr1, 23
-    ; GCN32-MUBUF-NEXT: $sgpr88 = V_READLANE_B32 $vgpr1, 24
-    ; GCN32-MUBUF-NEXT: $sgpr89 = V_READLANE_B32 $vgpr1, 25
-    ; GCN32-MUBUF-NEXT: $sgpr90 = V_READLANE_B32 $vgpr1, 26
-    ; GCN32-MUBUF-NEXT: $sgpr91 = V_READLANE_B32 $vgpr1, 27
-    ; GCN32-MUBUF-NEXT: $sgpr92 = V_READLANE_B32 $vgpr1, 28
-    ; GCN32-MUBUF-NEXT: $sgpr93 = V_READLANE_B32 $vgpr1, 29
-    ; GCN32-MUBUF-NEXT: $sgpr94 = V_READLANE_B32 $vgpr1, 30
-    ; GCN32-MUBUF-NEXT: $sgpr95 = V_READLANE_B32 killed $vgpr1, 31
-    ; GCN32-MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr2, implicit killed $vgpr1
+    ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 4294967295, implicit-def $vgpr0
+    ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 160, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $sgpr64 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN32-MUBUF-NEXT: $sgpr65 = V_READLANE_B32 $vgpr0, 1
+    ; GCN32-MUBUF-NEXT: $sgpr66 = V_READLANE_B32 $vgpr0, 2
+    ; GCN32-MUBUF-NEXT: $sgpr67 = V_READLANE_B32 $vgpr0, 3
+    ; GCN32-MUBUF-NEXT: $sgpr68 = V_READLANE_B32 $vgpr0, 4
+    ; GCN32-MUBUF-NEXT: $sgpr69 = V_READLANE_B32 $vgpr0, 5
+    ; GCN32-MUBUF-NEXT: $sgpr70 = V_READLANE_B32 $vgpr0, 6
+    ; GCN32-MUBUF-NEXT: $sgpr71 = V_READLANE_B32 $vgpr0, 7
+    ; GCN32-MUBUF-NEXT: $sgpr72 = V_READLANE_B32 $vgpr0, 8
+    ; GCN32-MUBUF-NEXT: $sgpr73 = V_READLANE_B32 $vgpr0, 9
+    ; GCN32-MUBUF-NEXT: $sgpr74 = V_READLANE_B32 $vgpr0, 10
+    ; GCN32-MUBUF-NEXT: $sgpr75 = V_READLANE_B32 $vgpr0, 11
+    ; GCN32-MUBUF-NEXT: $sgpr76 = V_READLANE_B32 $vgpr0, 12
+    ; GCN32-MUBUF-NEXT: $sgpr77 = V_READLANE_B32 $vgpr0, 13
+    ; GCN32-MUBUF-NEXT: $sgpr78 = V_READLANE_B32 $vgpr0, 14
+    ; GCN32-MUBUF-NEXT: $sgpr79 = V_READLANE_B32 $vgpr0, 15
+    ; GCN32-MUBUF-NEXT: $sgpr80 = V_READLANE_B32 $vgpr0, 16
+    ; GCN32-MUBUF-NEXT: $sgpr81 = V_READLANE_B32 $vgpr0, 17
+    ; GCN32-MUBUF-NEXT: $sgpr82 = V_READLANE_B32 $vgpr0, 18
+    ; GCN32-MUBUF-NEXT: $sgpr83 = V_READLANE_B32 $vgpr0, 19
+    ; GCN32-MUBUF-NEXT: $sgpr84 = V_READLANE_B32 $vgpr0, 20
+    ; GCN32-MUBUF-NEXT: $sgpr85 = V_READLANE_B32 $vgpr0, 21
+    ; GCN32-MUBUF-NEXT: $sgpr86 = V_READLANE_B32 $vgpr0, 22
+    ; GCN32-MUBUF-NEXT: $sgpr87 = V_READLANE_B32 $vgpr0, 23
+    ; GCN32-MUBUF-NEXT: $sgpr88 = V_READLANE_B32 $vgpr0, 24
+    ; GCN32-MUBUF-NEXT: $sgpr89 = V_READLANE_B32 $vgpr0, 25
+    ; GCN32-MUBUF-NEXT: $sgpr90 = V_READLANE_B32 $vgpr0, 26
+    ; GCN32-MUBUF-NEXT: $sgpr91 = V_READLANE_B32 $vgpr0, 27
+    ; GCN32-MUBUF-NEXT: $sgpr92 = V_READLANE_B32 $vgpr0, 28
+    ; GCN32-MUBUF-NEXT: $sgpr93 = V_READLANE_B32 $vgpr0, 29
+    ; GCN32-MUBUF-NEXT: $sgpr94 = V_READLANE_B32 $vgpr0, 30
+    ; GCN32-MUBUF-NEXT: $sgpr95 = V_READLANE_B32 killed $vgpr0, 31
+    ; GCN32-MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0, implicit killed $vgpr0
     ; GCN32-MUBUF-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
     ; GCN32-MUBUF-NEXT: $exec_lo = S_MOV_B32 1, implicit-def $vgpr0
     ; GCN32-MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -892,30 +892,30 @@ body:             |
     ; GCN64-FLATSCR-NEXT: $sgpr33 = S_MOV_B32 0
     ; GCN64-FLATSCR-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc
     ; GCN64-FLATSCR-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc
-    ; GCN64-FLATSCR-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 killed $vgpr1, 0
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
-    ; GCN64-FLATSCR-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr2
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 $vgpr2, 0, implicit-def $sgpr12_sgpr13
-    ; GCN64-FLATSCR-NEXT: $sgpr13 = V_READLANE_B32 killed $vgpr2, 1
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr2
-    ; GCN64-FLATSCR-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr3
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 $vgpr3, 0, implicit-def $sgpr12_sgpr13_sgpr14
-    ; GCN64-FLATSCR-NEXT: $sgpr13 = V_READLANE_B32 $vgpr3, 1
-    ; GCN64-FLATSCR-NEXT: $sgpr14 = V_READLANE_B32 killed $vgpr3, 2
-    ; GCN64-FLATSCR-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr3
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 killed $vgpr0, 0
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13
+    ; GCN64-FLATSCR-NEXT: $sgpr13 = V_READLANE_B32 killed $vgpr0, 1
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 7, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14
+    ; GCN64-FLATSCR-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN64-FLATSCR-NEXT: $sgpr14 = V_READLANE_B32 killed $vgpr0, 2
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 15, implicit-def $vgpr0
     ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -926,31 +926,31 @@ body:             |
     ; GCN64-FLATSCR-NEXT: $sgpr15 = V_READLANE_B32 killed $vgpr0, 3
     ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
-    ; GCN64-FLATSCR-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr1
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
-    ; GCN64-FLATSCR-NEXT: $sgpr13 = V_READLANE_B32 $vgpr1, 1
-    ; GCN64-FLATSCR-NEXT: $sgpr14 = V_READLANE_B32 $vgpr1, 2
-    ; GCN64-FLATSCR-NEXT: $sgpr15 = V_READLANE_B32 $vgpr1, 3
-    ; GCN64-FLATSCR-NEXT: $sgpr16 = V_READLANE_B32 killed $vgpr1, 4
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
-    ; GCN64-FLATSCR-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr2
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 $vgpr2, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN64-FLATSCR-NEXT: $sgpr13 = V_READLANE_B32 $vgpr2, 1
-    ; GCN64-FLATSCR-NEXT: $sgpr14 = V_READLANE_B32 $vgpr2, 2
-    ; GCN64-FLATSCR-NEXT: $sgpr15 = V_READLANE_B32 $vgpr2, 3
-    ; GCN64-FLATSCR-NEXT: $sgpr16 = V_READLANE_B32 $vgpr2, 4
-    ; GCN64-FLATSCR-NEXT: $sgpr17 = V_READLANE_B32 $vgpr2, 5
-    ; GCN64-FLATSCR-NEXT: $sgpr18 = V_READLANE_B32 $vgpr2, 6
-    ; GCN64-FLATSCR-NEXT: $sgpr19 = V_READLANE_B32 killed $vgpr2, 7
-    ; GCN64-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr2
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 31, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16
+    ; GCN64-FLATSCR-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN64-FLATSCR-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 2
+    ; GCN64-FLATSCR-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 3
+    ; GCN64-FLATSCR-NEXT: $sgpr16 = V_READLANE_B32 killed $vgpr0, 4
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 255, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+    ; GCN64-FLATSCR-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 1
+    ; GCN64-FLATSCR-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 2
+    ; GCN64-FLATSCR-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 3
+    ; GCN64-FLATSCR-NEXT: $sgpr16 = V_READLANE_B32 $vgpr0, 4
+    ; GCN64-FLATSCR-NEXT: $sgpr17 = V_READLANE_B32 $vgpr0, 5
+    ; GCN64-FLATSCR-NEXT: $sgpr18 = V_READLANE_B32 $vgpr0, 6
+    ; GCN64-FLATSCR-NEXT: $sgpr19 = V_READLANE_B32 killed $vgpr0, 7
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 65535, implicit-def $vgpr0
     ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -973,44 +973,44 @@ body:             |
     ; GCN64-FLATSCR-NEXT: $sgpr27 = V_READLANE_B32 killed $vgpr0, 15
     ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
-    ; GCN64-FLATSCR-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr1
-    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $sgpr64 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
-    ; GCN64-FLATSCR-NEXT: $sgpr65 = V_READLANE_B32 $vgpr1, 1
-    ; GCN64-FLATSCR-NEXT: $sgpr66 = V_READLANE_B32 $vgpr1, 2
-    ; GCN64-FLATSCR-NEXT: $sgpr67 = V_READLANE_B32 $vgpr1, 3
-    ; GCN64-FLATSCR-NEXT: $sgpr68 = V_READLANE_B32 $vgpr1, 4
-    ; GCN64-FLATSCR-NEXT: $sgpr69 = V_READLANE_B32 $vgpr1, 5
-    ; GCN64-FLATSCR-NEXT: $sgpr70 = V_READLANE_B32 $vgpr1, 6
-    ; GCN64-FLATSCR-NEXT: $sgpr71 = V_READLANE_B32 $vgpr1, 7
-    ; GCN64-FLATSCR-NEXT: $sgpr72 = V_READLANE_B32 $vgpr1, 8
-    ; GCN64-FLATSCR-NEXT: $sgpr73 = V_READLANE_B32 $vgpr1, 9
-    ; GCN64-FLATSCR-NEXT: $sgpr74 = V_READLANE_B32 $vgpr1, 10
-    ; GCN64-FLATSCR-NEXT: $sgpr75 = V_READLANE_B32 $vgpr1, 11
-    ; GCN64-FLATSCR-NEXT: $sgpr76 = V_READLANE_B32 $vgpr1, 12
-    ; GCN64-FLATSCR-NEXT: $sgpr77 = V_READLANE_B32 $vgpr1, 13
-    ; GCN64-FLATSCR-NEXT: $sgpr78 = V_READLANE_B32 $vgpr1, 14
-    ; GCN64-FLATSCR-NEXT: $sgpr79 = V_READLANE_B32 $vgpr1, 15
-    ; GCN64-FLATSCR-NEXT: $sgpr80 = V_READLANE_B32 $vgpr1, 16
-    ; GCN64-FLATSCR-NEXT: $sgpr81 = V_READLANE_B32 $vgpr1, 17
-    ; GCN64-FLATSCR-NEXT: $sgpr82 = V_READLANE_B32 $vgpr1, 18
-    ; GCN64-FLATSCR-NEXT: $sgpr83 = V_READLANE_B32 $vgpr1, 19
-    ; GCN64-FLATSCR-NEXT: $sgpr84 = V_READLANE_B32 $vgpr1, 20
-    ; GCN64-FLATSCR-NEXT: $sgpr85 = V_READLANE_B32 $vgpr1, 21
-    ; GCN64-FLATSCR-NEXT: $sgpr86 = V_READLANE_B32 $vgpr1, 22
-    ; GCN64-FLATSCR-NEXT: $sgpr87 = V_READLANE_B32 $vgpr1, 23
-    ; GCN64-FLATSCR-NEXT: $sgpr88 = V_READLANE_B32 $vgpr1, 24
-    ; GCN64-FLATSCR-NEXT: $sgpr89 = V_READLANE_B32 $vgpr1, 25
-    ; GCN64-FLATSCR-NEXT: $sgpr90 = V_READLANE_B32 $vgpr1, 26
-    ; GCN64-FLATSCR-NEXT: $sgpr91 = V_READLANE_B32 $vgpr1, 27
-    ; GCN64-FLATSCR-NEXT: $sgpr92 = V_READLANE_B32 $vgpr1, 28
-    ; GCN64-FLATSCR-NEXT: $sgpr93 = V_READLANE_B32 $vgpr1, 29
-    ; GCN64-FLATSCR-NEXT: $sgpr94 = V_READLANE_B32 $vgpr1, 30
-    ; GCN64-FLATSCR-NEXT: $sgpr95 = V_READLANE_B32 killed $vgpr1, 31
-    ; GCN64-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1
+    ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 4294967295, implicit-def $vgpr0
+    ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $sgpr64 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN64-FLATSCR-NEXT: $sgpr65 = V_READLANE_B32 $vgpr0, 1
+    ; GCN64-FLATSCR-NEXT: $sgpr66 = V_READLANE_B32 $vgpr0, 2
+    ; GCN64-FLATSCR-NEXT: $sgpr67 = V_READLANE_B32 $vgpr0, 3
+    ; GCN64-FLATSCR-NEXT: $sgpr68 = V_READLANE_B32 $vgpr0, 4
+    ; GCN64-FLATSCR-NEXT: $sgpr69 = V_READLANE_B32 $vgpr0, 5
+    ; GCN64-FLATSCR-NEXT: $sgpr70 = V_READLANE_B32 $vgpr0, 6
+    ; GCN64-FLATSCR-NEXT: $sgpr71 = V_READLANE_B32 $vgpr0, 7
+    ; GCN64-FLATSCR-NEXT: $sgpr72 = V_READLANE_B32 $vgpr0, 8
+    ; GCN64-FLATSCR-NEXT: $sgpr73 = V_READLANE_B32 $vgpr0, 9
+    ; GCN64-FLATSCR-NEXT: $sgpr74 = V_READLANE_B32 $vgpr0, 10
+    ; GCN64-FLATSCR-NEXT: $sgpr75 = V_READLANE_B32 $vgpr0, 11
+    ; GCN64-FLATSCR-NEXT: $sgpr76 = V_READLANE_B32 $vgpr0, 12
+    ; GCN64-FLATSCR-NEXT: $sgpr77 = V_READLANE_B32 $vgpr0, 13
+    ; GCN64-FLATSCR-NEXT: $sgpr78 = V_READLANE_B32 $vgpr0, 14
+    ; GCN64-FLATSCR-NEXT: $sgpr79 = V_READLANE_B32 $vgpr0, 15
+    ; GCN64-FLATSCR-NEXT: $sgpr80 = V_READLANE_B32 $vgpr0, 16
+    ; GCN64-FLATSCR-NEXT: $sgpr81 = V_READLANE_B32 $vgpr0, 17
+    ; GCN64-FLATSCR-NEXT: $sgpr82 = V_READLANE_B32 $vgpr0, 18
+    ; GCN64-FLATSCR-NEXT: $sgpr83 = V_READLANE_B32 $vgpr0, 19
+    ; GCN64-FLATSCR-NEXT: $sgpr84 = V_READLANE_B32 $vgpr0, 20
+    ; GCN64-FLATSCR-NEXT: $sgpr85 = V_READLANE_B32 $vgpr0, 21
+    ; GCN64-FLATSCR-NEXT: $sgpr86 = V_READLANE_B32 $vgpr0, 22
+    ; GCN64-FLATSCR-NEXT: $sgpr87 = V_READLANE_B32 $vgpr0, 23
+    ; GCN64-FLATSCR-NEXT: $sgpr88 = V_READLANE_B32 $vgpr0, 24
+    ; GCN64-FLATSCR-NEXT: $sgpr89 = V_READLANE_B32 $vgpr0, 25
+    ; GCN64-FLATSCR-NEXT: $sgpr90 = V_READLANE_B32 $vgpr0, 26
+    ; GCN64-FLATSCR-NEXT: $sgpr91 = V_READLANE_B32 $vgpr0, 27
+    ; GCN64-FLATSCR-NEXT: $sgpr92 = V_READLANE_B32 $vgpr0, 28
+    ; GCN64-FLATSCR-NEXT: $sgpr93 = V_READLANE_B32 $vgpr0, 29
+    ; GCN64-FLATSCR-NEXT: $sgpr94 = V_READLANE_B32 $vgpr0, 30
+    ; GCN64-FLATSCR-NEXT: $sgpr95 = V_READLANE_B32 killed $vgpr0, 31
+    ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GCN64-FLATSCR-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr0
     ; GCN64-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 855a44baf63ed..92da6309c16ba 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -1621,22 +1621,22 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 s[14:15], exec
+; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:    s_mov_b64 exec, 1
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:456
-; GCN-NEXT:    v_writelane_b32 v1, s30, 0
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
+; GCN-NEXT:    v_writelane_b32 v0, s30, 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[14:15]
-; GCN-NEXT:    s_mov_b64 s[12:13], exec
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:    s_mov_b64 exec, 1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    v_writelane_b32 v0, s31, 0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[12:13]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, child_function_ipra@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, child_function_ipra@rel32@hi+12
@@ -1645,15 +1645,15 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    s_mov_b64 s[0:1], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[10:11]
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    s_mov_b64 s[8:9], exec
+; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:    s_mov_b64 exec, 1
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v1, 0
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:456
+; GCN-NEXT:    v_readlane_b32 s31, v0, 0
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:    s_mov_b64 exec, 1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 1fbdb5d512da3..2b41e412b586d 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -26,8 +26,8 @@ define amdgpu_kernel void @test_inst_offset_kernel() {
 ; FLATSCR:       ; %bb.0: ; %entry
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; FLATSCR-NEXT:    s_mov_b32 s1, 0
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s1 offset:8 glc
+; FLATSCR-NEXT:    s_mov_b32 s0, 0
+; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
@@ -79,8 +79,8 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() {
 ; FLATSCR:       ; %bb.0: ; %entry
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; FLATSCR-NEXT:    s_mov_b32 s1, 0
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s1 offset:8 glc
+; FLATSCR-NEXT:    s_mov_b32 s0, 0
+; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_movk_i32 s0, 0x1000
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
@@ -218,10 +218,10 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
 ; FLATSCR:       ; %bb.0: ; %entry
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; FLATSCR-NEXT:    s_mov_b32 s9, 0
+; FLATSCR-NEXT:    s_mov_b32 s8, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s9 offset:8 glc
+; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 offset:8 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_movk_i32 s8, 0x1004
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
@@ -301,16 +301,17 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
 ; FLATSCR:       ; %bb.0: ; %entry
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; FLATSCR-NEXT:    s_mov_b32 s2, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s2 offset:12 glc
+; FLATSCR-NEXT:    s_mov_b32 s0, 0
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_movk_i32 s0, 0xff8
-; FLATSCR-NEXT:    s_mov_b32 s1, 0
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b32 s0, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s1 offset:8 glc
+; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_movk_i32 s0, 0xff8
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
@@ -367,16 +368,17 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
 ; FLATSCR:       ; %bb.0: ; %entry
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; FLATSCR-NEXT:    s_mov_b32 s2, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s2 offset:12 glc
+; FLATSCR-NEXT:    s_mov_b32 s0, 0
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc
-; FLATSCR-NEXT:    s_mov_b32 s1, 0
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b32 s0, 0
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s1 offset:8 glc
+; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 356348aa64c89..dd13c41253fb6 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10094,7 +10094,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:240
 ; GFX6-NEXT:    s_addc_u32 s41, s41, 0
 ; GFX6-NEXT:    s_mov_b32 s2, 0x83400
-; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10215,6 +10214,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 1
+; GFX6-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX6-NEXT:    buffer_store_dword v7, v4, s[40:43], 0 offen
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s[4:11]
@@ -10230,12 +10230,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
-; GFX6-NEXT:    s_mov_b32 s2, 0x83800
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s12, 0x83800
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
+; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s[8:15]
@@ -10263,251 +10263,251 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:  ; %bb.1: ; %bb0
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v9, s8, 0
-; GFX6-NEXT:    v_writelane_b32 v9, s9, 1
-; GFX6-NEXT:    v_writelane_b32 v9, s10, 2
-; GFX6-NEXT:    v_writelane_b32 v9, s11, 3
-; GFX6-NEXT:    v_writelane_b32 v9, s12, 4
-; GFX6-NEXT:    v_writelane_b32 v9, s13, 5
-; GFX6-NEXT:    v_writelane_b32 v9, s14, 6
-; GFX6-NEXT:    v_writelane_b32 v9, s15, 7
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2100
-; GFX6-NEXT:    buffer_store_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s8, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s9, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s10, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s11, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s12, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2100
+; GFX6-NEXT:    buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x20e0
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x20e0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s8, v8, 0
-; GFX6-NEXT:    v_readlane_b32 s9, v8, 1
-; GFX6-NEXT:    v_readlane_b32 s10, v8, 2
-; GFX6-NEXT:    v_readlane_b32 s11, v8, 3
-; GFX6-NEXT:    v_readlane_b32 s12, v8, 4
-; GFX6-NEXT:    v_readlane_b32 s13, v8, 5
-; GFX6-NEXT:    v_readlane_b32 s14, v8, 6
-; GFX6-NEXT:    v_readlane_b32 s15, v8, 7
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s8, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s9, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s10, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s11, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s12, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s13, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s14, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s15, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v7, s16, 0
-; GFX6-NEXT:    v_writelane_b32 v7, s17, 1
-; GFX6-NEXT:    v_writelane_b32 v7, s18, 2
-; GFX6-NEXT:    v_writelane_b32 v7, s19, 3
-; GFX6-NEXT:    v_writelane_b32 v7, s20, 4
-; GFX6-NEXT:    v_writelane_b32 v7, s21, 5
-; GFX6-NEXT:    v_writelane_b32 v7, s22, 6
-; GFX6-NEXT:    v_writelane_b32 v7, s23, 7
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2120
-; GFX6-NEXT:    buffer_store_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s16, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s17, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s18, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s19, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s20, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2120
+; GFX6-NEXT:    buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2100
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2100
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s16, v9, 0
-; GFX6-NEXT:    v_readlane_b32 s17, v9, 1
-; GFX6-NEXT:    v_readlane_b32 s18, v9, 2
-; GFX6-NEXT:    v_readlane_b32 s19, v9, 3
-; GFX6-NEXT:    v_readlane_b32 s20, v9, 4
-; GFX6-NEXT:    v_readlane_b32 s21, v9, 5
-; GFX6-NEXT:    v_readlane_b32 s22, v9, 6
-; GFX6-NEXT:    v_readlane_b32 s23, v9, 7
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s16, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s17, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s18, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s19, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s20, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s21, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s22, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s23, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v8, s24, 0
-; GFX6-NEXT:    v_writelane_b32 v8, s25, 1
-; GFX6-NEXT:    v_writelane_b32 v8, s26, 2
-; GFX6-NEXT:    v_writelane_b32 v8, s27, 3
-; GFX6-NEXT:    v_writelane_b32 v8, s28, 4
-; GFX6-NEXT:    v_writelane_b32 v8, s29, 5
-; GFX6-NEXT:    v_writelane_b32 v8, s30, 6
-; GFX6-NEXT:    v_writelane_b32 v8, s31, 7
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2140
-; GFX6-NEXT:    buffer_store_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s24, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s25, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s26, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s27, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s28, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2140
+; GFX6-NEXT:    buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2120
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2120
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s24, v7, 0
-; GFX6-NEXT:    v_readlane_b32 s25, v7, 1
-; GFX6-NEXT:    v_readlane_b32 s26, v7, 2
-; GFX6-NEXT:    v_readlane_b32 s27, v7, 3
-; GFX6-NEXT:    v_readlane_b32 s28, v7, 4
-; GFX6-NEXT:    v_readlane_b32 s29, v7, 5
-; GFX6-NEXT:    v_readlane_b32 s30, v7, 6
-; GFX6-NEXT:    v_readlane_b32 s31, v7, 7
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s24, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s25, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s26, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s27, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s28, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s29, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s30, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s31, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v10, s0, 0
-; GFX6-NEXT:    v_writelane_b32 v10, s1, 1
-; GFX6-NEXT:    v_writelane_b32 v10, s2, 2
-; GFX6-NEXT:    v_writelane_b32 v10, s3, 3
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2160
-; GFX6-NEXT:    buffer_store_dword v10, v4, s[40:43], 0 offen ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s0, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2160
+; GFX6-NEXT:    buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
-; GFX6-NEXT:    s_mov_b64 s[44:45], exec
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v8, s4, 0
-; GFX6-NEXT:    v_writelane_b32 v8, s5, 1
-; GFX6-NEXT:    v_writelane_b32 v8, s6, 2
-; GFX6-NEXT:    v_writelane_b32 v8, s7, 3
-; GFX6-NEXT:    s_mov_b32 s0, 0x85c00
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s6, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s7, 3
+; GFX6-NEXT:    s_mov_b32 s44, 0x85c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s44 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v9, s2, 0
-; GFX6-NEXT:    v_writelane_b32 v9, s3, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s2, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s3, 1
 ; GFX6-NEXT:    s_mov_b32 s4, 0x86600
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2140
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2140
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v7, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v7, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v7, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v7, 3
-; GFX6-NEXT:    v_readlane_b32 s4, v7, 4
-; GFX6-NEXT:    v_readlane_b32 s5, v7, 5
-; GFX6-NEXT:    v_readlane_b32 s6, v7, 6
-; GFX6-NEXT:    v_readlane_b32 s7, v7, 7
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s4, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s5, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s6, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s7, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v8, s36, 0
-; GFX6-NEXT:    v_writelane_b32 v8, s37, 1
-; GFX6-NEXT:    v_writelane_b32 v8, s38, 2
-; GFX6-NEXT:    v_writelane_b32 v8, s39, 3
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2180
-; GFX6-NEXT:    buffer_store_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s36, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s37, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s38, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s39, 3
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2180
+; GFX6-NEXT:    buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 s[38:39], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v10, s36, 0
-; GFX6-NEXT:    v_writelane_b32 v10, s37, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s36, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s37, 1
 ; GFX6-NEXT:    s_mov_b32 s44, 0x86400
-; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s44 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s44 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[38:39]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2170
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2170
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s36, v9, 0
-; GFX6-NEXT:    v_readlane_b32 s37, v9, 1
-; GFX6-NEXT:    v_readlane_b32 s38, v9, 2
-; GFX6-NEXT:    v_readlane_b32 s39, v9, 3
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s36, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s37, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s38, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s39, v4, 3
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_not_b64 exec, exec
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2190
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2190
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_not_b64 exec, exec
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2190
-; GFX6-NEXT:    buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2190
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_not_b64 exec, exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s44, v7, 0
-; GFX6-NEXT:    v_readlane_b32 s45, v7, 1
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s44, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s45, v4, 1
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_not_b64 exec, exec
 ; GFX6-NEXT:    s_mov_b64 vcc, s[34:35]
 ; GFX6-NEXT:    s_not_b64 exec, exec
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2198
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2198
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_not_b64 exec, exec
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x2198
-; GFX6-NEXT:    buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2198
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_not_b64 exec, exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s34, v8, 0
-; GFX6-NEXT:    v_readlane_b32 s35, v8, 1
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s34, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s35, v4, 1
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_not_b64 exec, exec
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35],s[44:45]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    s_mov_b64 s[34:35], vcc
-; GFX6-NEXT:    s_mov_b64 s[8:9], exec
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    s_mov_b32 s0, 0x86000
+; GFX6-NEXT:    s_mov_b32 s2, 0x86000
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s36, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s37, v4, 1
@@ -10515,19 +10515,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s39, v4, 3
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    s_mov_b32 s6, 0x85800
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v7, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v7, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v7, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v7, 3
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX6-NEXT:    s_mov_b32 s2, 0x83800

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
index 3914c3b2fe073..69442032f37ec 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
@@ -48,23 +48,23 @@ body:             |
     ; GFX9-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
     ; GFX9-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
     ; GFX9-NEXT: $vcc = IMPLICIT_DEF
-    ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr2
-    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GFX9-NEXT: $vgpr2 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr2, implicit $vcc
-    ; GFX9-NEXT: $vgpr2 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr2, implicit $vcc
-    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
-    ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr2
+    ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc
+    ; GFX9-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GFX9-NEXT: $vcc = IMPLICIT_DEF
-    ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr1
-    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GFX9-NEXT: $vgpr1 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr1, implicit $vcc
-    ; GFX9-NEXT: $vgpr1 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr1, implicit killed $vcc
-    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
-    ; GFX9-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr1
+    ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc
+    ; GFX9-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
     ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -84,23 +84,23 @@ body:             |
     ; GFX10-NEXT: $sgpr96 = S_ADD_U32 $sgpr96, $sgpr9, implicit-def $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
     ; GFX10-NEXT: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
     ; GFX10-NEXT: $vcc = IMPLICIT_DEF
-    ; GFX10-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr2
-    ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GFX10-NEXT: $vgpr2 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr2, implicit $vcc
-    ; GFX10-NEXT: $vgpr2 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr2, implicit $vcc
-    ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
-    ; GFX10-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr2
+    ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc
+    ; GFX10-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc
+    ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GFX10-NEXT: $vcc = IMPLICIT_DEF
-    ; GFX10-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr1
-    ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GFX10-NEXT: $vgpr1 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr1, implicit $vcc
-    ; GFX10-NEXT: $vgpr1 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr1, implicit killed $vcc
-    ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
-    ; GFX10-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr1
+    ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc
+    ; GFX10-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc
+    ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
     ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
@@ -114,23 +114,23 @@ body:             |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $sgpr33 = S_MOV_B32 0
     ; GFX11-NEXT: $vcc = IMPLICIT_DEF
-    ; GFX11-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec
-    ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr2
-    ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GFX11-NEXT: $vgpr2 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr2, implicit $vcc
-    ; GFX11-NEXT: $vgpr2 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr2, implicit $vcc
-    ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
-    ; GFX11-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr2
+    ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GFX11-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc
+    ; GFX11-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc
+    ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GFX11-NEXT: $vcc = IMPLICIT_DEF
-    ; GFX11-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
-    ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr1
-    ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
-    ; GFX11-NEXT: $vgpr1 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr1, implicit $vcc
-    ; GFX11-NEXT: $vgpr1 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr1, implicit killed $vcc
-    ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
-    ; GFX11-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
-    ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr2_sgpr3, implicit killed $vgpr1
+    ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
+    ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
+    ; GFX11-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc
+    ; GFX11-NEXT: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc
+    ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
+    ; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
     ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
     ; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
     ; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index 8ac9fd0f71b09..b243c499bf9e6 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -68,9 +68,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
 ; FLATSCR-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; FLATSCR-NEXT:    s_cbranch_execz .LBB0_2
 ; FLATSCR-NEXT:  ; %bb.1: ; %if.then4.i
-; FLATSCR-NEXT:    s_movk_i32 s1, 0x4000
+; FLATSCR-NEXT:    s_movk_i32 s0, 0x4000
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:4
+; FLATSCR-NEXT:    s_waitcnt_depctr 0xffe3
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0x41c64e6d
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s1 offset:4
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; FLATSCR-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, s0, 0x3039
@@ -95,9 +96,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
 ; MUBUF11-NEXT:    v_cmpx_ne_u32_e32 0, v0
 ; MUBUF11-NEXT:    s_cbranch_execz .LBB0_2
 ; MUBUF11-NEXT:  ; %bb.1: ; %if.then4.i
-; MUBUF11-NEXT:    s_movk_i32 s1, 0x4000
+; MUBUF11-NEXT:    s_movk_i32 s0, 0x4000
+; MUBUF11-NEXT:    scratch_load_b64 v[0:1], off, s0 offset:4
 ; MUBUF11-NEXT:    s_mov_b32 s0, 0x41c64e6d
-; MUBUF11-NEXT:    scratch_load_b64 v[0:1], off, s1 offset:4
 ; MUBUF11-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF11-NEXT:    v_add_nc_u32_e32 v2, v1, v0
 ; MUBUF11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, s0, 0x3039
@@ -123,9 +124,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
 ; FLATSCR11-NEXT:    v_cmpx_ne_u32_e32 0, v0
 ; FLATSCR11-NEXT:    s_cbranch_execz .LBB0_2
 ; FLATSCR11-NEXT:  ; %bb.1: ; %if.then4.i
-; FLATSCR11-NEXT:    s_movk_i32 s1, 0x4000
+; FLATSCR11-NEXT:    s_movk_i32 s0, 0x4000
+; FLATSCR11-NEXT:    scratch_load_b64 v[0:1], off, s0 offset:4
 ; FLATSCR11-NEXT:    s_mov_b32 s0, 0x41c64e6d
-; FLATSCR11-NEXT:    scratch_load_b64 v[0:1], off, s1 offset:4
 ; FLATSCR11-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR11-NEXT:    v_add_nc_u32_e32 v2, v1, v0
 ; FLATSCR11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, s0, 0x3039
