[llvm] 230c13d - [AMDGPU] Pick available high VGPR for CSR SGPR spilling (#78669)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 23 17:38:49 PST 2024


Author: Christudasan Devadasan
Date: 2024-01-24T07:08:43+05:30
New Revision: 230c13d59d0843c3b738920b85c341cc78a61fa9

URL: https://github.com/llvm/llvm-project/commit/230c13d59d0843c3b738920b85c341cc78a61fa9
DIFF: https://github.com/llvm/llvm-project/commit/230c13d59d0843c3b738920b85c341cc78a61fa9.diff

LOG: [AMDGPU] Pick available high VGPR for CSR SGPR spilling (#78669)

CSR SGPR spilling currently uses the lowest available physical VGPRs.
This imposes high register pressure when trying to allocate large VGPR
tuples within the default register budget.

This patch changes the spilling strategy by picking the VGPRs in
reverse order: the highest available VGPR is chosen first and, after
register allocation, the chosen VGPRs are shifted back to the lowest
available range. That way, the lowest VGPRs remain available for
allocation, which improves the chance of finding a large number of
contiguous registers.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
    llvm/test/CodeGen/AMDGPU/bf16.ll
    llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
    llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
    llvm/test/CodeGen/AMDGPU/ipra.ll
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
    llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
    llvm/test/CodeGen/AMDGPU/sibling-call.ll
    llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
    llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
    llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
    llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
    llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
    llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
    llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
    llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index a02c2a46590822..9d062eb156d5c5 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -95,7 +95,8 @@ static void getVGPRSpillLaneOrTempRegister(
                                          TargetStackID::SGPRSpill);
 
     if (TRI->spillSGPRToVGPR() &&
-        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
+        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
+                                         /*IsPrologEpilog=*/true)) {
       // 2: There's no free lane to spill, and no free register to save the
       // SGPR, so we're forced to take another VGPR to use for the spill.
       MFI->addToPrologEpilogSGPRSpills(
@@ -1560,6 +1561,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
     return;
 
+  MFI->shiftSpillPhysVGPRsToLowestRange(MF);
+
   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
   if (MFI->isEntryFunction())
     return;

diff  --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 70ffb8ea0a6220..4b13825040ebe6 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -368,7 +368,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
           // regalloc aware CFI generation to insert new CFIs along with the
           // intermediate spills is implemented. There is no such support
           // currently exist in the LLVM compiler.
-          if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
+          if (FuncInfo->allocateSGPRSpillToVGPRLane(
+                  MF, FI, /*SpillToPhysVGPRLane=*/true)) {
             bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
                 MI, FI, nullptr, Indexes, LIS, true);
             if (!Spilled)

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index e8142244b7db69..b94d143a75e5ed 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -312,6 +312,33 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
   return false;
 }
 
+void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange(
+    MachineFunction &MF) {
+  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (unsigned I = 0, E = SpillPhysVGPRs.size(); I < E; ++I) {
+    Register Reg = SpillPhysVGPRs[I];
+    Register NewReg =
+        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+    if (!NewReg || NewReg >= Reg)
+      break;
+
+    MRI.replaceRegWith(Reg, NewReg);
+
+    // Update various tables with the new VGPR.
+    SpillPhysVGPRs[I] = NewReg;
+    WWMReservedRegs.remove(Reg);
+    WWMReservedRegs.insert(NewReg);
+    WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg]));
+    WWMSpills.erase(Reg);
+
+    for (MachineBasicBlock &MBB : MF) {
+      MBB.removeLiveIn(Reg);
+      MBB.sortUniqueLiveIns();
+    }
+  }
+}
+
 bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
     MachineFunction &MF, int FI, unsigned LaneIndex) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -329,13 +356,17 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
 }
 
 bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
-    MachineFunction &MF, int FI, unsigned LaneIndex) {
+    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   Register LaneVGPR;
   if (!LaneIndex) {
-    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+    // Find the highest available register if called before RA to ensure the
+    // lowest registers are available for allocation. The LaneVGPR, in that
+    // case, will be shifted back to the lowest range after VGPR allocation.
+    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
+                                       !IsPrologEpilog);
     if (LaneVGPR == AMDGPU::NoRegister) {
       // We have no VGPRs left for spilling SGPRs. Reset because we will not
       // partially spill the SGPR to VGPRs.
@@ -359,12 +390,12 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
   return true;
 }
 
-bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
-                                                        int FI,
-                                                        bool IsPrologEpilog) {
+bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
+    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
+    bool IsPrologEpilog) {
   std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
-      IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
-                     : SGPRSpillsToVirtualVGPRLanes[FI];
+      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
+                          : SGPRSpillsToVirtualVGPRLanes[FI];
 
   // This has already been allocated.
   if (!SpillLanes.empty())
@@ -384,14 +415,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
   assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
          "not spilling SGPRs to VGPRs");
 
-  unsigned &NumSpillLanes =
-      IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
+  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
+                                                : NumVirtualVGPRSpillLanes;
 
   for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
     unsigned LaneIndex = (NumSpillLanes % WaveSize);
 
-    bool Allocated = IsPrologEpilog
-                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
+    bool Allocated = SpillToPhysVGPRLane
+                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
+                                                             IsPrologEpilog)
                          : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
     if (!Allocated) {
       NumSpillLanes -= I;

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ecc31fbd9dd3d8..9ff66a094f991f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -548,7 +548,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
                                         unsigned LaneIndex);
   bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI,
-                                         unsigned LaneIndex);
+                                         unsigned LaneIndex,
+                                         bool IsPrologEpilog);
 
 public:
   Register getVGPRForAGPRCopy() const {
@@ -588,6 +589,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   }
 
   ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; }
+
   const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
   const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
 
@@ -702,7 +704,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
       I->second.IsDead = true;
   }
 
+  // To bring the Physical VGPRs in the highest range allocated for CSR SGPR
+  // spilling into the lowest available range.
+  void shiftSpillPhysVGPRsToLowestRange(MachineFunction &MF);
+
   bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI,
+                                   bool SpillToPhysVGPRLane = false,
                                    bool IsPrologEpilog = false);
   bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index e65eca78106105..bdd7ff11fde634 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -116,38 +116,38 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v43, v1
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v43
+; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_mov_b32_e32 v42, v1
+; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v2
+; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
@@ -156,7 +156,7 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
@@ -170,32 +170,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
 ; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v42
-; CHECK-NEXT:    v_and_b32_e32 v2, v2, v43
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
+; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
+; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xf800
 ; CHECK-NEXT:    s_mov_b32 s33, s4
@@ -257,37 +257,37 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v43, v31
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v42, v31
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v3
-; CHECK-NEXT:    v_mov_b32_e32 v41, v2
+; CHECK-NEXT:    v_mov_b32_e32 v41, v3
+; CHECK-NEXT:    v_mov_b32_e32 v40, v2
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
@@ -296,7 +296,7 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[41:42]
+; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[40:41]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
@@ -309,29 +309,29 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
 ; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v43
+; CHECK-NEXT:    v_mov_b32_e32 v31, v42
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xf800
 ; CHECK-NEXT:    s_mov_b32 s33, s4
@@ -400,38 +400,38 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v43, v1
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v43
+; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_mov_b32_e32 v42, v1
+; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v42, v2
+; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
@@ -440,7 +440,7 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
@@ -454,32 +454,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
 ; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v42
-; CHECK-NEXT:    v_and_b32_e32 v2, v2, v43
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
+; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
+; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xf800
 ; CHECK-NEXT:    s_mov_b32 s33, s4
@@ -543,34 +543,34 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v42, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v42, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v42, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v42, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v42, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v42, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v42, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v42, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v42, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
+; CHECK-NEXT:    v_writelane_b32 v42, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v42, s41, 9
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v42, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v42, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v42, s44, 12
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v42, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
@@ -578,10 +578,10 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_mov_b32 s45, s12
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 1, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v41, 1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
@@ -595,28 +595,28 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
 ; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_readlane_b32 s45, v42, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v42, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v42, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v42, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v42, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v42, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v42, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v42, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v42, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v42, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v42, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v42, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v42, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v42, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v42, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
 ; CHECK-NEXT:    s_mov_b32 s33, s4
@@ -685,36 +685,36 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
+; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
 ; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
+; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
+; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
+; CHECK-NEXT:    v_mov_b32_e32 v41, v1
+; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v41
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
@@ -722,10 +722,10 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_mov_b32 s45, s12
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; CHECK-NEXT:    v_or_b32_e32 v43, 1, v2
+; CHECK-NEXT:    v_or_b32_e32 v42, 1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v43
+; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
@@ -739,31 +739,31 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
 ; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x80000000, v42
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x80000000, v41
+; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
+; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xf800
 ; CHECK-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 4a696879ad7b23..2f7190e7611024 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -3544,12 +3544,12 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v3, s30, 0
-; GCN-NEXT:    v_writelane_b32 v3, s31, 1
+; GCN-NEXT:    v_writelane_b32 v4, s30, 0
+; GCN-NEXT:    v_writelane_b32 v4, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -3558,15 +3558,15 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 2, v2
-; GCN-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, 2, v2
+; GCN-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v3, 1
-; GCN-NEXT:    v_readlane_b32 s30, v3, 0
+; GCN-NEXT:    v_readlane_b32 s31, v4, 1
+; GCN-NEXT:    v_readlane_b32 s30, v4, 0
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s8
@@ -3579,28 +3579,28 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    s_getpc_b64 s[4:5]
 ; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 2, v2
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 2, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX7-NEXT:    s_mov_b32 s33, s8
@@ -3737,12 +3737,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v4, s30, 0
-; GCN-NEXT:    v_writelane_b32 v4, s31, 1
+; GCN-NEXT:    v_writelane_b32 v5, s30, 0
+; GCN-NEXT:    v_writelane_b32 v5, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -3751,16 +3751,16 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 4, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 4, v3
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GCN-NEXT:    buffer_store_short v2, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v2, v4, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v4, 1
-; GCN-NEXT:    v_readlane_b32 s30, v4, 0
+; GCN-NEXT:    v_readlane_b32 s31, v5, 1
+; GCN-NEXT:    v_readlane_b32 s30, v5, 0
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s8
@@ -3808,26 +3808,26 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0x400
 ; GFX8-NEXT:    s_getpc_b64 s[4:5]
 ; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v2
-; GFX8-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    buffer_store_short v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX8-NEXT:    s_mov_b32 s33, s6
@@ -3941,12 +3941,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v5, s30, 0
-; GCN-NEXT:    v_writelane_b32 v5, s31, 1
+; GCN-NEXT:    v_writelane_b32 v8, s30, 0
+; GCN-NEXT:    v_writelane_b32 v8, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -3957,21 +3957,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 6, v4
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 4, v4
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 2, v4
-; GCN-NEXT:    buffer_store_short v3, v6, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 6, v4
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 4, v4
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 2, v4
+; GCN-NEXT:    buffer_store_short v3, v5, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v2, v6, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v1, v7, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v5, 1
-; GCN-NEXT:    v_readlane_b32 s30, v5, 0
+; GCN-NEXT:    v_readlane_b32 s31, v8, 1
+; GCN-NEXT:    v_readlane_b32 s30, v8, 0
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s8
@@ -3984,21 +3984,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    s_getpc_b64 s[4:5]
 ; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v6, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v6, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 6, v4
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 6, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    buffer_store_short v3, v6, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_short v3, v5, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -4010,10 +4010,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v6, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX7-NEXT:    s_mov_b32 s33, s8
@@ -4026,26 +4026,26 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0x400
 ; GFX8-NEXT:    s_getpc_b64 s[4:5]
 ; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v2
-; GFX8-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v4, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX8-NEXT:    s_mov_b32 s33, s6
@@ -4157,12 +4157,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v9, s30, 0
-; GCN-NEXT:    v_writelane_b32 v9, s31, 1
+; GCN-NEXT:    v_writelane_b32 v16, s30, 0
+; GCN-NEXT:    v_writelane_b32 v16, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -4177,33 +4177,33 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 14, v8
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 12, v8
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 10, v8
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, 8, v8
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 6, v8
-; GCN-NEXT:    v_add_i32_e32 v15, vcc, 4, v8
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, 2, v8
-; GCN-NEXT:    buffer_store_short v7, v10, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 14, v8
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 12, v8
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 10, v8
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 8, v8
+; GCN-NEXT:    v_add_i32_e32 v13, vcc, 6, v8
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, 4, v8
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, 2, v8
+; GCN-NEXT:    buffer_store_short v7, v9, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v6, v11, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v6, v10, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v5, v12, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v5, v11, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v4, v13, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v4, v12, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v3, v14, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v3, v13, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v15, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v2, v14, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v1, v16, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v1, v15, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v9, 1
-; GCN-NEXT:    v_readlane_b32 s30, v9, 0
+; GCN-NEXT:    v_readlane_b32 s31, v16, 1
+; GCN-NEXT:    v_readlane_b32 s30, v16, 0
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s8
@@ -4216,21 +4216,21 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    s_getpc_b64 s[4:5]
 ; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v10, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v10, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 14, v8
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 14, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    buffer_store_short v7, v10, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_short v7, v9, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 12, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
@@ -4258,10 +4258,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v10, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX7-NEXT:    s_mov_b32 s33, s8
@@ -4274,19 +4274,19 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0x400
 ; GFX8-NEXT:    s_getpc_b64 s[4:5]
 ; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v6, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v6, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 12, v4
-; GFX8-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 12, v4
+; GFX8-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v4
 ; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
@@ -4296,10 +4296,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v6, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX8-NEXT:    s_mov_b32 s33, s6
@@ -4419,12 +4419,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v17, s30, 0
-; GCN-NEXT:    v_writelane_b32 v17, s31, 1
+; GCN-NEXT:    v_writelane_b32 v28, s30, 0
+; GCN-NEXT:    v_writelane_b32 v28, s31, 1
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -4447,57 +4447,57 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
 ; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 30, v16
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 28, v16
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 26, v16
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 24, v16
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 22, v16
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 20, v16
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 18, v16
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 16, v16
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 14, v16
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 12, v16
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 10, v16
-; GCN-NEXT:    buffer_store_short v15, v18, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v17, vcc, 30, v16
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 28, v16
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 26, v16
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 24, v16
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 22, v16
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 20, v16
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 18, v16
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 16, v16
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 14, v16
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 12, v16
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 10, v16
+; GCN-NEXT:    buffer_store_short v15, v17, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v15, vcc, 8, v16
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 6, v16
-; GCN-NEXT:    buffer_store_short v14, v19, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v17, vcc, 6, v16
+; GCN-NEXT:    buffer_store_short v14, v18, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v14, vcc, 4, v16
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 2, v16
-; GCN-NEXT:    buffer_store_short v13, v20, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 2, v16
+; GCN-NEXT:    buffer_store_short v13, v19, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v12, v21, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v12, v20, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v11, v22, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v11, v21, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v10, v23, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v10, v22, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v9, v24, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v9, v23, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v8, v25, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v8, v24, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v7, v26, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v7, v25, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v6, v27, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v6, v26, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v5, v28, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v5, v27, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v4, v15, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v3, v18, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v3, v17, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v2, v14, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v1, v19, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v1, v18, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v17, 1
-; GCN-NEXT:    v_readlane_b32 s30, v17, 0
+; GCN-NEXT:    v_readlane_b32 s31, v28, 1
+; GCN-NEXT:    v_readlane_b32 s30, v28, 0
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s8
@@ -4510,21 +4510,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
 ; GFX7-NEXT:    s_getpc_b64 s[4:5]
 ; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v17, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v17, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v18, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v18, s31, 1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 30, v16
+; GFX7-NEXT:    v_add_i32_e32 v17, vcc, 30, v16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    buffer_store_short v15, v18, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_short v15, v17, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 28, v16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
@@ -4584,10 +4584,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v17, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v17, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v18, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v18, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX7-NEXT:    s_mov_b32 s33, s8
@@ -4600,19 +4600,19 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0x400
 ; GFX8-NEXT:    s_getpc_b64 s[4:5]
 ; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v10, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v10, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 28, v8
-; GFX8-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 28, v8
+; GFX8-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 24, v8
 ; GFX8-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
@@ -4634,10 +4634,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX8-NEXT:    v_readlane_b32 s31, v10, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX8-NEXT:    s_mov_b32 s33, s6
@@ -27303,7 +27303,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -27330,16 +27330,16 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v11
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v12
-; GFX7-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v32, s30, 0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v13
-; GFX7-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX7-NEXT:    v_writelane_b32 v32, s31, 1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v14
-; GFX7-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX7-NEXT:    v_writelane_b32 v32, s34, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v15
-; GFX7-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX7-NEXT:    v_writelane_b32 v32, s35, 3
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
 ; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
@@ -27357,7 +27357,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56
 ; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:60
 ; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
 ; GFX7-NEXT:    s_waitcnt vmcnt(14)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s[4:5]
@@ -27388,7 +27388,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX7-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[30:31]
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e64 v15, v32, v15, s[34:35]
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v31, v15, s[34:35]
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
@@ -27404,12 +27404,12 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT:    v_readlane_b32 s35, v31, 3
-; GFX7-NEXT:    v_readlane_b32 s34, v31, 2
-; GFX7-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX7-NEXT:    v_readlane_b32 s35, v32, 3
+; GFX7-NEXT:    v_readlane_b32 s34, v32, 2
+; GFX7-NEXT:    v_readlane_b32 s31, v32, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v32, 0
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -28211,107 +28211,107 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v31, s31, 1
-; GFX8-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v34, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v34, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v34, s34, 2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v34, s35, 3
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX8-NEXT:    v_writelane_b32 v31, s36, 4
+; GFX8-NEXT:    v_writelane_b32 v34, s36, 4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX8-NEXT:    v_writelane_b32 v31, s37, 5
+; GFX8-NEXT:    v_writelane_b32 v34, s37, 5
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v3
-; GFX8-NEXT:    v_writelane_b32 v31, s38, 6
+; GFX8-NEXT:    v_writelane_b32 v34, s38, 6
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v4
-; GFX8-NEXT:    v_writelane_b32 v31, s39, 7
+; GFX8-NEXT:    v_writelane_b32 v34, s39, 7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v5
-; GFX8-NEXT:    v_writelane_b32 v31, s40, 8
+; GFX8-NEXT:    v_writelane_b32 v34, s40, 8
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v6
-; GFX8-NEXT:    v_writelane_b32 v31, s41, 9
+; GFX8-NEXT:    v_writelane_b32 v34, s41, 9
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v7
-; GFX8-NEXT:    v_writelane_b32 v31, s42, 10
+; GFX8-NEXT:    v_writelane_b32 v34, s42, 10
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v8
-; GFX8-NEXT:    v_writelane_b32 v31, s43, 11
+; GFX8-NEXT:    v_writelane_b32 v34, s43, 11
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v9
-; GFX8-NEXT:    v_writelane_b32 v31, s44, 12
+; GFX8-NEXT:    v_writelane_b32 v34, s44, 12
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v10
-; GFX8-NEXT:    v_writelane_b32 v31, s45, 13
+; GFX8-NEXT:    v_writelane_b32 v34, s45, 13
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v11
-; GFX8-NEXT:    v_writelane_b32 v31, s46, 14
+; GFX8-NEXT:    v_writelane_b32 v34, s46, 14
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v12
-; GFX8-NEXT:    v_writelane_b32 v31, s47, 15
+; GFX8-NEXT:    v_writelane_b32 v34, s47, 15
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v13
-; GFX8-NEXT:    v_writelane_b32 v31, s48, 16
+; GFX8-NEXT:    v_writelane_b32 v34, s48, 16
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v14
-; GFX8-NEXT:    v_writelane_b32 v31, s49, 17
+; GFX8-NEXT:    v_writelane_b32 v34, s49, 17
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v15
-; GFX8-NEXT:    v_writelane_b32 v31, s50, 18
+; GFX8-NEXT:    v_writelane_b32 v34, s50, 18
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v16
-; GFX8-NEXT:    v_writelane_b32 v31, s51, 19
+; GFX8-NEXT:    v_writelane_b32 v34, s51, 19
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v17
-; GFX8-NEXT:    v_writelane_b32 v31, s52, 20
+; GFX8-NEXT:    v_writelane_b32 v34, s52, 20
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v18
-; GFX8-NEXT:    v_writelane_b32 v31, s53, 21
+; GFX8-NEXT:    v_writelane_b32 v34, s53, 21
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v19
-; GFX8-NEXT:    v_writelane_b32 v31, s54, 22
+; GFX8-NEXT:    v_writelane_b32 v34, s54, 22
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v20
-; GFX8-NEXT:    v_writelane_b32 v31, s55, 23
+; GFX8-NEXT:    v_writelane_b32 v34, s55, 23
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v21
-; GFX8-NEXT:    v_writelane_b32 v31, s56, 24
+; GFX8-NEXT:    v_writelane_b32 v34, s56, 24
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v22
-; GFX8-NEXT:    v_writelane_b32 v31, s57, 25
+; GFX8-NEXT:    v_writelane_b32 v34, s57, 25
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v23
-; GFX8-NEXT:    v_writelane_b32 v31, s58, 26
+; GFX8-NEXT:    v_writelane_b32 v34, s58, 26
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v24
-; GFX8-NEXT:    v_writelane_b32 v31, s59, 27
+; GFX8-NEXT:    v_writelane_b32 v34, s59, 27
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT:    v_writelane_b32 v31, s60, 28
+; GFX8-NEXT:    v_writelane_b32 v34, s60, 28
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT:    v_writelane_b32 v31, s61, 29
+; GFX8-NEXT:    v_writelane_b32 v34, s61, 29
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT:    v_writelane_b32 v31, s62, 30
+; GFX8-NEXT:    v_writelane_b32 v34, s62, 30
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT:    v_writelane_b32 v31, s63, 31
+; GFX8-NEXT:    v_writelane_b32 v34, s63, 31
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX8-NEXT:    v_writelane_b32 v31, s64, 32
+; GFX8-NEXT:    v_writelane_b32 v34, s64, 32
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v30
-; GFX8-NEXT:    v_writelane_b32 v31, s65, 33
+; GFX8-NEXT:    v_writelane_b32 v34, s65, 33
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v0
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX8-NEXT:    v_writelane_b32 v31, s66, 34
-; GFX8-NEXT:    v_writelane_b32 v31, s67, 35
+; GFX8-NEXT:    v_writelane_b32 v34, s66, 34
+; GFX8-NEXT:    v_writelane_b32 v34, s67, 35
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
@@ -28344,74 +28344,74 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
 ; GFX8-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
 ; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
 ; GFX8-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:128
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v33
-; GFX8-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[66:67]
-; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v33, s[64:65]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
-; GFX8-NEXT:    v_cndmask_b32_e64 v33, v34, v33, s[62:63]
-; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[60:61]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v26
-; GFX8-NEXT:    v_cndmask_b32_e64 v32, v34, v32, s[58:59]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v32
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v33, v28, s[66:67]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[64:65]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v30
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v33, v32, s[62:63]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, v31, s[60:61]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v31, s[58:59]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[56:57]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v24
-; GFX8-NEXT:    v_cndmask_b32_e64 v27, v34, v27, s[54:55]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v33, v27, s[54:55]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[52:53]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v23
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v22
-; GFX8-NEXT:    v_cndmask_b32_e64 v25, v34, v25, s[50:51]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v33, v25, s[50:51]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[48:49]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v20
-; GFX8-NEXT:    v_cndmask_b32_e64 v23, v34, v23, s[46:47]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v33, v23, s[46:47]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[44:45]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v19
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v18
-; GFX8-NEXT:    v_cndmask_b32_e64 v21, v34, v21, s[42:43]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v33, v21, s[42:43]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[40:41]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v16
-; GFX8-NEXT:    v_cndmask_b32_e64 v19, v34, v19, s[38:39]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v33, v19, s[38:39]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[36:37]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v14
-; GFX8-NEXT:    v_cndmask_b32_e64 v17, v34, v17, s[34:35]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v33, v17, s[34:35]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[30:31]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v15, v34, v15, s[28:29]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v33, v15, s[28:29]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[26:27]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v11
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v10
-; GFX8-NEXT:    v_cndmask_b32_e64 v13, v34, v13, s[24:25]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v33, v13, s[24:25]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[22:23]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v34, v11, s[20:21]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v33, v11, s[20:21]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[18:19]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v34, v9, s[16:17]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v33, v9, s[16:17]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[14:15]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v34, v7, s[12:13]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v33, v7, s[12:13]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v34, v5, s[8:9]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v33, v5, s[8:9]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v34, v3, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v33, v3, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -28434,8 +28434,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v32
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v28
 ; GFX8-NEXT:    v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -28445,44 +28445,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_readlane_b32 s67, v31, 35
-; GFX8-NEXT:    v_readlane_b32 s66, v31, 34
-; GFX8-NEXT:    v_readlane_b32 s65, v31, 33
-; GFX8-NEXT:    v_readlane_b32 s64, v31, 32
-; GFX8-NEXT:    v_readlane_b32 s63, v31, 31
-; GFX8-NEXT:    v_readlane_b32 s62, v31, 30
-; GFX8-NEXT:    v_readlane_b32 s61, v31, 29
-; GFX8-NEXT:    v_readlane_b32 s60, v31, 28
-; GFX8-NEXT:    v_readlane_b32 s59, v31, 27
-; GFX8-NEXT:    v_readlane_b32 s58, v31, 26
-; GFX8-NEXT:    v_readlane_b32 s57, v31, 25
-; GFX8-NEXT:    v_readlane_b32 s56, v31, 24
-; GFX8-NEXT:    v_readlane_b32 s55, v31, 23
-; GFX8-NEXT:    v_readlane_b32 s54, v31, 22
-; GFX8-NEXT:    v_readlane_b32 s53, v31, 21
-; GFX8-NEXT:    v_readlane_b32 s52, v31, 20
-; GFX8-NEXT:    v_readlane_b32 s51, v31, 19
-; GFX8-NEXT:    v_readlane_b32 s50, v31, 18
-; GFX8-NEXT:    v_readlane_b32 s49, v31, 17
-; GFX8-NEXT:    v_readlane_b32 s48, v31, 16
-; GFX8-NEXT:    v_readlane_b32 s47, v31, 15
-; GFX8-NEXT:    v_readlane_b32 s46, v31, 14
-; GFX8-NEXT:    v_readlane_b32 s45, v31, 13
-; GFX8-NEXT:    v_readlane_b32 s44, v31, 12
-; GFX8-NEXT:    v_readlane_b32 s43, v31, 11
-; GFX8-NEXT:    v_readlane_b32 s42, v31, 10
-; GFX8-NEXT:    v_readlane_b32 s41, v31, 9
-; GFX8-NEXT:    v_readlane_b32 s40, v31, 8
-; GFX8-NEXT:    v_readlane_b32 s39, v31, 7
-; GFX8-NEXT:    v_readlane_b32 s38, v31, 6
-; GFX8-NEXT:    v_readlane_b32 s37, v31, 5
-; GFX8-NEXT:    v_readlane_b32 s36, v31, 4
-; GFX8-NEXT:    v_readlane_b32 s35, v31, 3
-; GFX8-NEXT:    v_readlane_b32 s34, v31, 2
-; GFX8-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX8-NEXT:    v_readlane_b32 s67, v34, 35
+; GFX8-NEXT:    v_readlane_b32 s66, v34, 34
+; GFX8-NEXT:    v_readlane_b32 s65, v34, 33
+; GFX8-NEXT:    v_readlane_b32 s64, v34, 32
+; GFX8-NEXT:    v_readlane_b32 s63, v34, 31
+; GFX8-NEXT:    v_readlane_b32 s62, v34, 30
+; GFX8-NEXT:    v_readlane_b32 s61, v34, 29
+; GFX8-NEXT:    v_readlane_b32 s60, v34, 28
+; GFX8-NEXT:    v_readlane_b32 s59, v34, 27
+; GFX8-NEXT:    v_readlane_b32 s58, v34, 26
+; GFX8-NEXT:    v_readlane_b32 s57, v34, 25
+; GFX8-NEXT:    v_readlane_b32 s56, v34, 24
+; GFX8-NEXT:    v_readlane_b32 s55, v34, 23
+; GFX8-NEXT:    v_readlane_b32 s54, v34, 22
+; GFX8-NEXT:    v_readlane_b32 s53, v34, 21
+; GFX8-NEXT:    v_readlane_b32 s52, v34, 20
+; GFX8-NEXT:    v_readlane_b32 s51, v34, 19
+; GFX8-NEXT:    v_readlane_b32 s50, v34, 18
+; GFX8-NEXT:    v_readlane_b32 s49, v34, 17
+; GFX8-NEXT:    v_readlane_b32 s48, v34, 16
+; GFX8-NEXT:    v_readlane_b32 s47, v34, 15
+; GFX8-NEXT:    v_readlane_b32 s46, v34, 14
+; GFX8-NEXT:    v_readlane_b32 s45, v34, 13
+; GFX8-NEXT:    v_readlane_b32 s44, v34, 12
+; GFX8-NEXT:    v_readlane_b32 s43, v34, 11
+; GFX8-NEXT:    v_readlane_b32 s42, v34, 10
+; GFX8-NEXT:    v_readlane_b32 s41, v34, 9
+; GFX8-NEXT:    v_readlane_b32 s40, v34, 8
+; GFX8-NEXT:    v_readlane_b32 s39, v34, 7
+; GFX8-NEXT:    v_readlane_b32 s38, v34, 6
+; GFX8-NEXT:    v_readlane_b32 s37, v34, 5
+; GFX8-NEXT:    v_readlane_b32 s36, v34, 4
+; GFX8-NEXT:    v_readlane_b32 s35, v34, 3
+; GFX8-NEXT:    v_readlane_b32 s34, v34, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v34, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v34, 0
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -28491,104 +28491,104 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v31, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v33, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v33, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v33, s34, 2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v33, s35, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
-; GFX9-NEXT:    v_writelane_b32 v31, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v33, s36, 4
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX9-NEXT:    v_writelane_b32 v31, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v33, s37, 5
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
-; GFX9-NEXT:    v_writelane_b32 v31, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v33, s38, 6
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
-; GFX9-NEXT:    v_writelane_b32 v31, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v33, s39, 7
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
-; GFX9-NEXT:    v_writelane_b32 v31, s40, 8
+; GFX9-NEXT:    v_writelane_b32 v33, s40, 8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
-; GFX9-NEXT:    v_writelane_b32 v31, s41, 9
+; GFX9-NEXT:    v_writelane_b32 v33, s41, 9
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
-; GFX9-NEXT:    v_writelane_b32 v31, s42, 10
+; GFX9-NEXT:    v_writelane_b32 v33, s42, 10
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
-; GFX9-NEXT:    v_writelane_b32 v31, s43, 11
+; GFX9-NEXT:    v_writelane_b32 v33, s43, 11
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
-; GFX9-NEXT:    v_writelane_b32 v31, s44, 12
+; GFX9-NEXT:    v_writelane_b32 v33, s44, 12
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
-; GFX9-NEXT:    v_writelane_b32 v31, s45, 13
+; GFX9-NEXT:    v_writelane_b32 v33, s45, 13
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
-; GFX9-NEXT:    v_writelane_b32 v31, s46, 14
+; GFX9-NEXT:    v_writelane_b32 v33, s46, 14
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
-; GFX9-NEXT:    v_writelane_b32 v31, s47, 15
+; GFX9-NEXT:    v_writelane_b32 v33, s47, 15
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
-; GFX9-NEXT:    v_writelane_b32 v31, s48, 16
+; GFX9-NEXT:    v_writelane_b32 v33, s48, 16
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
-; GFX9-NEXT:    v_writelane_b32 v31, s49, 17
+; GFX9-NEXT:    v_writelane_b32 v33, s49, 17
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
-; GFX9-NEXT:    v_writelane_b32 v31, s50, 18
+; GFX9-NEXT:    v_writelane_b32 v33, s50, 18
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
-; GFX9-NEXT:    v_writelane_b32 v31, s51, 19
+; GFX9-NEXT:    v_writelane_b32 v33, s51, 19
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v19
-; GFX9-NEXT:    v_writelane_b32 v31, s52, 20
+; GFX9-NEXT:    v_writelane_b32 v33, s52, 20
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
-; GFX9-NEXT:    v_writelane_b32 v31, s53, 21
+; GFX9-NEXT:    v_writelane_b32 v33, s53, 21
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v21
-; GFX9-NEXT:    v_writelane_b32 v31, s54, 22
+; GFX9-NEXT:    v_writelane_b32 v33, s54, 22
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
-; GFX9-NEXT:    v_writelane_b32 v31, s55, 23
+; GFX9-NEXT:    v_writelane_b32 v33, s55, 23
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v23
-; GFX9-NEXT:    v_writelane_b32 v31, s56, 24
+; GFX9-NEXT:    v_writelane_b32 v33, s56, 24
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v22
-; GFX9-NEXT:    v_writelane_b32 v31, s57, 25
+; GFX9-NEXT:    v_writelane_b32 v33, s57, 25
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v25
-; GFX9-NEXT:    v_writelane_b32 v31, s58, 26
+; GFX9-NEXT:    v_writelane_b32 v33, s58, 26
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v24
-; GFX9-NEXT:    v_writelane_b32 v31, s59, 27
+; GFX9-NEXT:    v_writelane_b32 v33, s59, 27
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v27
-; GFX9-NEXT:    v_writelane_b32 v31, s60, 28
+; GFX9-NEXT:    v_writelane_b32 v33, s60, 28
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v26
-; GFX9-NEXT:    v_writelane_b32 v31, s61, 29
+; GFX9-NEXT:    v_writelane_b32 v33, s61, 29
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v29
-; GFX9-NEXT:    v_writelane_b32 v31, s62, 30
+; GFX9-NEXT:    v_writelane_b32 v33, s62, 30
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v28
-; GFX9-NEXT:    v_writelane_b32 v31, s63, 31
+; GFX9-NEXT:    v_writelane_b32 v33, s63, 31
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
-; GFX9-NEXT:    v_writelane_b32 v31, s64, 32
-; GFX9-NEXT:    v_writelane_b32 v31, s65, 33
-; GFX9-NEXT:    v_writelane_b32 v31, s66, 34
+; GFX9-NEXT:    v_writelane_b32 v33, s64, 32
+; GFX9-NEXT:    v_writelane_b32 v33, s65, 33
+; GFX9-NEXT:    v_writelane_b32 v33, s66, 34
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_writelane_b32 v31, s67, 35
+; GFX9-NEXT:    v_writelane_b32 v33, s67, 35
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -28625,14 +28625,14 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
 ; GFX9-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:124
 ; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v29, v32, v33, s[66:67]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v31, v32, s[66:67]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX9-NEXT:    v_cndmask_b32_e64 v32, v32, v33, s[64:65]
-; GFX9-NEXT:    v_cndmask_b32_e64 v33, v28, v30, s[62:63]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[64:65]
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v28, v30, s[62:63]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
 ; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, v30, s[60:61]
@@ -28707,46 +28707,46 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9-NEXT:    v_perm_b32 v11, v22, v25, s4
 ; GFX9-NEXT:    v_perm_b32 v12, v24, v27, s4
 ; GFX9-NEXT:    v_perm_b32 v13, v26, v30, s4
-; GFX9-NEXT:    v_perm_b32 v14, v28, v33, s4
-; GFX9-NEXT:    v_perm_b32 v15, v32, v29, s4
-; GFX9-NEXT:    v_readlane_b32 s67, v31, 35
-; GFX9-NEXT:    v_readlane_b32 s66, v31, 34
-; GFX9-NEXT:    v_readlane_b32 s65, v31, 33
-; GFX9-NEXT:    v_readlane_b32 s64, v31, 32
-; GFX9-NEXT:    v_readlane_b32 s63, v31, 31
-; GFX9-NEXT:    v_readlane_b32 s62, v31, 30
-; GFX9-NEXT:    v_readlane_b32 s61, v31, 29
-; GFX9-NEXT:    v_readlane_b32 s60, v31, 28
-; GFX9-NEXT:    v_readlane_b32 s59, v31, 27
-; GFX9-NEXT:    v_readlane_b32 s58, v31, 26
-; GFX9-NEXT:    v_readlane_b32 s57, v31, 25
-; GFX9-NEXT:    v_readlane_b32 s56, v31, 24
-; GFX9-NEXT:    v_readlane_b32 s55, v31, 23
-; GFX9-NEXT:    v_readlane_b32 s54, v31, 22
-; GFX9-NEXT:    v_readlane_b32 s53, v31, 21
-; GFX9-NEXT:    v_readlane_b32 s52, v31, 20
-; GFX9-NEXT:    v_readlane_b32 s51, v31, 19
-; GFX9-NEXT:    v_readlane_b32 s50, v31, 18
-; GFX9-NEXT:    v_readlane_b32 s49, v31, 17
-; GFX9-NEXT:    v_readlane_b32 s48, v31, 16
-; GFX9-NEXT:    v_readlane_b32 s47, v31, 15
-; GFX9-NEXT:    v_readlane_b32 s46, v31, 14
-; GFX9-NEXT:    v_readlane_b32 s45, v31, 13
-; GFX9-NEXT:    v_readlane_b32 s44, v31, 12
-; GFX9-NEXT:    v_readlane_b32 s43, v31, 11
-; GFX9-NEXT:    v_readlane_b32 s42, v31, 10
-; GFX9-NEXT:    v_readlane_b32 s41, v31, 9
-; GFX9-NEXT:    v_readlane_b32 s40, v31, 8
-; GFX9-NEXT:    v_readlane_b32 s39, v31, 7
-; GFX9-NEXT:    v_readlane_b32 s38, v31, 6
-; GFX9-NEXT:    v_readlane_b32 s37, v31, 5
-; GFX9-NEXT:    v_readlane_b32 s36, v31, 4
-; GFX9-NEXT:    v_readlane_b32 s35, v31, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v31, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX9-NEXT:    v_perm_b32 v14, v28, v32, s4
+; GFX9-NEXT:    v_perm_b32 v15, v31, v29, s4
+; GFX9-NEXT:    v_readlane_b32 s67, v33, 35
+; GFX9-NEXT:    v_readlane_b32 s66, v33, 34
+; GFX9-NEXT:    v_readlane_b32 s65, v33, 33
+; GFX9-NEXT:    v_readlane_b32 s64, v33, 32
+; GFX9-NEXT:    v_readlane_b32 s63, v33, 31
+; GFX9-NEXT:    v_readlane_b32 s62, v33, 30
+; GFX9-NEXT:    v_readlane_b32 s61, v33, 29
+; GFX9-NEXT:    v_readlane_b32 s60, v33, 28
+; GFX9-NEXT:    v_readlane_b32 s59, v33, 27
+; GFX9-NEXT:    v_readlane_b32 s58, v33, 26
+; GFX9-NEXT:    v_readlane_b32 s57, v33, 25
+; GFX9-NEXT:    v_readlane_b32 s56, v33, 24
+; GFX9-NEXT:    v_readlane_b32 s55, v33, 23
+; GFX9-NEXT:    v_readlane_b32 s54, v33, 22
+; GFX9-NEXT:    v_readlane_b32 s53, v33, 21
+; GFX9-NEXT:    v_readlane_b32 s52, v33, 20
+; GFX9-NEXT:    v_readlane_b32 s51, v33, 19
+; GFX9-NEXT:    v_readlane_b32 s50, v33, 18
+; GFX9-NEXT:    v_readlane_b32 s49, v33, 17
+; GFX9-NEXT:    v_readlane_b32 s48, v33, 16
+; GFX9-NEXT:    v_readlane_b32 s47, v33, 15
+; GFX9-NEXT:    v_readlane_b32 s46, v33, 14
+; GFX9-NEXT:    v_readlane_b32 s45, v33, 13
+; GFX9-NEXT:    v_readlane_b32 s44, v33, 12
+; GFX9-NEXT:    v_readlane_b32 s43, v33, 11
+; GFX9-NEXT:    v_readlane_b32 s42, v33, 10
+; GFX9-NEXT:    v_readlane_b32 s41, v33, 9
+; GFX9-NEXT:    v_readlane_b32 s40, v33, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v33, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v33, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v33, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v33, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v33, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v33, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v33, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v33, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -28754,8 +28754,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-LABEL: v_vselect_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    v_and_b32_e32 v29, 1, v29
@@ -28770,31 +28770,31 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
 ; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
 ; GFX10-NEXT:    s_clause 0x14
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
-; GFX10-NEXT:    buffer_load_ushort v34, off, s[0:3], s32
-; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:128
-; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:48
-; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:116
-; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:52
-; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:120
-; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:56
-; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:32
-; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
-; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:104
-; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:40
-; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:108
-; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:44
-; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:112
-; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:72
-; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:76
-; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    buffer_load_ushort v33, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:80
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:92
-; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:28
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v30
 ; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:96
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v28
@@ -28816,17 +28816,17 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:24
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s13, 1, v12
 ; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
 ; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
 ; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
 ; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
@@ -28845,7 +28845,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s17, 1, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s18, 1, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s19, 1, v0
-; GFX10-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s35, 3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s20, 1, v27
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s21, 1, v25
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s22, 1, v23
@@ -28861,111 +28861,111 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s34, 1, v5
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s35, 1, v9
 ; GFX10-NEXT:    s_waitcnt vmcnt(32)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
 ; GFX10-NEXT:    s_waitcnt vmcnt(31)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v33
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
 ; GFX10-NEXT:    s_waitcnt vmcnt(30)
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v34
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v33
 ; GFX10-NEXT:    s_waitcnt vmcnt(29)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v35
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v34
 ; GFX10-NEXT:    s_waitcnt vmcnt(28)
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v35, v36, s4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v36
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, v33, v32, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v34, v35, s4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v32, v31, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(25)
-; GFX10-NEXT:    v_cndmask_b32_e64 v19, v38, v39, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v37, v38, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(24)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v48
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
 ; GFX10-NEXT:    s_waitcnt vmcnt(23)
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, v48, v49, s6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v49
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v39
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v38
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v37
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v39, v48, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v48
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v38
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v37
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v36
 ; GFX10-NEXT:    s_waitcnt vmcnt(18)
-; GFX10-NEXT:    v_cndmask_b32_e64 v27, v53, v54, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v52, v53, s10
 ; GFX10-NEXT:    s_waitcnt vmcnt(17)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v55
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v54
 ; GFX10-NEXT:    s_waitcnt vmcnt(16)
-; GFX10-NEXT:    v_cndmask_b32_e64 v21, v55, v64, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v54, v55, s9
 ; GFX10-NEXT:    s_waitcnt vmcnt(15)
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v65, v37, s8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v65
-; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v64
-; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v54
-; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v53
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v51, v52, s11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v52
-; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v64, v36, s8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v64
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v55
+; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v53
+; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v52
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, v50, v51, s11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v51
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v50
 ; GFX10-NEXT:    s_waitcnt vmcnt(9)
-; GFX10-NEXT:    v_cndmask_b32_e64 v37, v30, v50, s12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v50
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v30, v49, s12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v49
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v29, v69, s13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v69
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v29, v68, s13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v68
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v24, v22, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v24, v22, s15
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v68, v20, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v67, v20, s16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v68
+; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v67
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v67, v18, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v66, v18, s17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e64 v49, v28, v26, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v28, v26, s14
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v67
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v66, v16, s18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v65, v16, s18
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v66
+; GFX10-NEXT:    v_lshrrev_b32_e32 v55, 16, v65
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v14, v12, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, v14, v12, s19
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v66, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v67, v6, v5, s20
-; GFX10-NEXT:    v_cndmask_b32_e64 v68, v8, v7, s21
-; GFX10-NEXT:    v_cndmask_b32_e64 v69, v10, v9, s22
+; GFX10-NEXT:    v_cndmask_b32_e32 v65, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v66, v6, v5, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v8, v7, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v68, v10, v9, s22
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v25, v23, s23
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v33, v32, s24
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v36, v35, s25
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v30, v38, s26
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v29, v48, s27
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v32, v31, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v35, v34, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v30, v37, s26
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v29, v39, s27
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v28, v26, s28
-; GFX10-NEXT:    v_cndmask_b32_e64 v20, v52, v20, s29
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v51, v20, s29
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v14, v12, s31
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v64, v16, s30
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v54, v18, s34
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v55, v16, s30
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v53, v18, s34
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, v24, v22, s35
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, v4, v3, s4
-; GFX10-NEXT:    v_perm_b32 v0, v0, v65, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v1, v55, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v2, v2, v53, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v3, v20, v51, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v4, v12, v50, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v5, v5, v49, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v6, v6, v39, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v7, v7, v37, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v8, v8, v34, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v0, v64, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v1, v54, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v2, v52, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v20, v50, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v12, v49, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v5, v5, v48, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v6, v38, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v7, v36, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v8, v8, v33, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v9, v9, v27, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v10, v10, v21, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v11, v69, v11, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v12, v68, v19, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v13, v67, v13, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v14, v66, v17, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v68, v11, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v67, v19, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v13, v66, v13, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v14, v65, v17, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v15, v16, v15, 0x5040100
-; GFX10-NEXT:    v_readlane_b32 s35, v31, 3
-; GFX10-NEXT:    v_readlane_b32 s34, v31, 2
-; GFX10-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v31, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX10-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 634106d20489e5..e926a3c728cbd2 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -278,11 +278,11 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
-; GCN: v_writelane_b32 v0
+; GCN: v_writelane_b32 v1
 ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4
 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4
 ; GCN: ;;#ASMSTART
-; GCN: v_writelane_b32 v0
+; GCN: v_writelane_b32 v1
 
 ; MUBUF:        s_addk_i32 s32, 0x400
 ; FLATSCR:      s_add_i32 s32, s32, 16
@@ -320,19 +320,19 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-COUNT-61: v_writelane_b32 v0,
+; GCN-COUNT-61: v_writelane_b32 v1,
 ; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
-; GCN: v_writelane_b32 v0,
+; GCN: v_writelane_b32 v1,
 ; MUBUF:   buffer_store_dword
 ; FLATSCR: scratch_store_dword
 ; GCN: ;;#ASMSTART
-; GCN: v_writelane_b32 v0,
+; GCN: v_writelane_b32 v1,
 ; MUBUF:   buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
 ; MUBUF:        s_addk_i32 s32, 0x400
 ; FLATSCR:      s_add_i32 s32, s32, 16
-; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0
+; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index 36a7ed51227a69..764c40ebc714d7 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -19,25 +19,25 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v40, s16, 16
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
-; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
-; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v41, s16, 16
+; CHECK-NEXT:    v_writelane_b32 v41, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v41, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v41, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v41, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v41, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v41, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v41, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v41, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v41, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v41, s41, 9
+; CHECK-NEXT:    v_writelane_b32 v41, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v41, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v41, s44, 12
 ; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
-; CHECK-NEXT:    v_writelane_b32 v40, s46, 14
+; CHECK-NEXT:    v_writelane_b32 v41, s45, 13
+; CHECK-NEXT:    v_writelane_b32 v41, s46, 14
 ; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; CHECK-NEXT:    ;DEBUG_VALUE: dummy:dummy <- undef
 ; CHECK-NEXT:  .Ltmp0:
@@ -45,11 +45,11 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12
-; CHECK-NEXT:    v_writelane_b32 v40, s47, 15
+; CHECK-NEXT:    v_writelane_b32 v41, s47, 15
 ; CHECK-NEXT:    s_load_dwordx2 s[46:47], s[4:5], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_mov_b32_e32 v40, v31
 ; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
@@ -67,33 +67,33 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
 ; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[46:47]
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:    ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0]
 ; CHECK-NEXT:    .loc 1 0 9 is_stmt 0 ; dummy:0:9
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
-; CHECK-NEXT:    v_readlane_b32 s47, v40, 15
-; CHECK-NEXT:    v_readlane_b32 s46, v40, 14
-; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v40, 16
+; CHECK-NEXT:    v_readlane_b32 s47, v41, 15
+; CHECK-NEXT:    v_readlane_b32 s46, v41, 14
+; CHECK-NEXT:    v_readlane_b32 s45, v41, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v41, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v41, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v41, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v41, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v41, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v41, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v41, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v41, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v41, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v41, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v41, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v41, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v41, 0
+; CHECK-NEXT:    v_readlane_b32 s4, v41, 16
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
 ; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
 ; CHECK-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 145ab4ae6378b4..a118fa388f86d9 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -3992,29 +3992,29 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
-; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    global_load_ubyte v0, v[41:42], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    global_load_ubyte v0, v[40:41], off
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i8_ret@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i8_ret@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    global_store_byte v[41:42], v0, off
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    global_store_byte v[40:41], v0, off
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4027,30 +4027,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i8_ret@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i8_ret@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_ubyte v0, v[41:42], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_ubyte v0, v[40:41], off
+; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    global_store_byte v[41:42], v0, off
+; GFX10-NEXT:    global_store_byte v[40:41], v0, off
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4064,30 +4064,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_mov_b32_e32 v42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_u8 v0, v[41:42], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    global_load_u8 v0, v[40:41], off
+; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    global_store_b8 v[41:42], v0, off
+; GFX11-NEXT:    global_store_b8 v[40:41], v0, off
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4100,30 +4100,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[41:42], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[40:41], off
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    global_store_byte v[41:42], v0, off
+; GFX10-SCRATCH-NEXT:    global_store_byte v[40:41], v0, off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4144,33 +4144,33 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
-; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    global_load_ushort v0, v[41:42], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    global_load_ushort v0, v[40:41], off
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_short v[41:42], v0, off
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    global_store_short v[40:41], v0, off
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4183,34 +4183,34 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_ushort v0, v[41:42], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_ushort v0, v[40:41], off
+; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_short v[41:42], v0, off
+; GFX10-NEXT:    global_store_short v[40:41], v0, off
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4224,36 +4224,36 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_mov_b32_e32 v42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_u16 v0, v[41:42], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    global_load_u16 v0, v[40:41], off
+; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 8, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b16 v[41:42], v0, off
+; GFX11-NEXT:    global_store_b16 v[40:41], v0, off
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4266,34 +4266,34 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[41:42], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[40:41], off
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b16 v1, 8, v0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    global_store_short v[41:42], v0, off
+; GFX10-SCRATCH-NEXT:    global_store_short v[40:41], v0, off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4314,19 +4314,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
-; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    global_load_dword v0, v[41:42], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    global_load_dword v0, v[40:41], off
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4336,15 +4336,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_byte v[3:4], v2, off
-; GFX9-NEXT:    global_store_short v[41:42], v0, off
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    global_store_short v[40:41], v0, off
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4357,20 +4357,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_dword v0, v[41:42], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_dword v0, v[40:41], off
+; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4378,17 +4378,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    global_store_byte v[3:4], v2, off
-; GFX10-NEXT:    global_store_short v[41:42], v0, off
+; GFX10-NEXT:    global_store_short v[40:41], v0, off
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4402,20 +4402,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_mov_b32_e32 v42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b32 v0, v[41:42], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    global_load_b32 v0, v[40:41], off
+; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4425,18 +4425,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
-; GFX11-NEXT:    global_store_b16 v[41:42], v3, off
+; GFX11-NEXT:    global_store_b16 v[40:41], v3, off
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4449,20 +4449,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[41:42], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[40:41], off
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4470,17 +4470,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    global_store_byte v[3:4], v2, off
-; GFX10-SCRATCH-NEXT:    global_store_short v[41:42], v0, off
+; GFX10-SCRATCH-NEXT:    global_store_short v[40:41], v0, off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4501,19 +4501,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
-; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    global_load_dword v0, v[41:42], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    global_load_dword v0, v[40:41], off
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i8_ret@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i8_ret@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
@@ -4524,15 +4524,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dword v[41:42], v0, off
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    global_store_dword v[40:41], v0, off
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4545,20 +4545,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i8_ret@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i8_ret@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_dword v0, v[41:42], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_dword v0, v[40:41], off
+; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4566,18 +4566,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dword v[41:42], v0, off
+; GFX10-NEXT:    global_store_dword v[40:41], v0, off
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4591,20 +4591,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_mov_b32_e32 v42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b32 v0, v[41:42], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    global_load_b32 v0, v[40:41], off
+; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4615,22 +4615,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b32 v[41:42], v0, off
+; GFX11-NEXT:    global_store_b32 v[40:41], v0, off
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4643,20 +4643,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret@abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[41:42], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[40:41], off
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4664,18 +4664,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    global_store_dword v[41:42], v0, off
+; GFX10-SCRATCH-NEXT:    global_store_dword v[40:41], v0, off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4696,19 +4696,19 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
-; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[41:42], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[40:41], off
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5i8_ret@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5i8_ret@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
@@ -4724,15 +4724,15 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_byte v[0:1], v4, off
-; GFX9-NEXT:    global_store_dword v[41:42], v2, off
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    global_store_dword v[40:41], v2, off
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4745,20 +4745,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i8_ret@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i8_ret@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[41:42], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[40:41], off
+; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
@@ -4768,21 +4768,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_byte v[0:1], v4, off
-; GFX10-NEXT:    global_store_dword v[41:42], v2, off
+; GFX10-NEXT:    global_store_dword v[40:41], v2, off
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -4796,20 +4796,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_mov_b32_e32 v42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b64 v[5:6], v[41:42], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    global_load_b64 v[5:6], v[40:41], off
+; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
@@ -4822,10 +4822,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
@@ -4835,12 +4835,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
-; GFX11-NEXT:    global_store_b32 v[41:42], v2, off
+; GFX11-NEXT:    global_store_b32 v[40:41], v2, off
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -4853,20 +4853,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret@abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[5:6], v[41:42], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[5:6], v[40:41], off
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
@@ -4876,21 +4876,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    global_store_byte v[0:1], v4, off
-; GFX10-SCRATCH-NEXT:    global_store_dword v[41:42], v2, off
+; GFX10-SCRATCH-NEXT:    global_store_dword v[40:41], v2, off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -4911,19 +4911,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
-; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[41:42], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[40:41], off
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4944,15 +4944,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dwordx2 v[41:42], v[3:4], off
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    global_store_dwordx2 v[40:41], v[3:4], off
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -4965,20 +4965,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[41:42], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[40:41], off
+; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4993,21 +4993,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-NEXT:    v_lshlrev_b16 v7, 8, v7
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dwordx2 v[41:42], v[0:1], off
+; GFX10-NEXT:    global_store_dwordx2 v[40:41], v[0:1], off
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -5021,31 +5021,31 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_mov_b32_e32 v42, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b64 v[0:1], v[41:42], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    global_load_b64 v[0:1], v[40:41], off
+; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
@@ -5056,23 +5056,23 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-NEXT:    global_store_b64 v[41:42], v[0:1], off
+; GFX11-NEXT:    global_store_b64 v[40:41], v[0:1], off
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -5085,20 +5085,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[41:42], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[40:41], off
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -5113,21 +5113,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v7, 8, v7
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    global_store_dwordx2 v[41:42], v[0:1], off
+; GFX10-SCRATCH-NEXT:    global_store_dwordx2 v[40:41], v[0:1], off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
@@ -5148,24 +5148,24 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
-; GFX9-NEXT:    v_mov_b32_e32 v43, 16
-; GFX9-NEXT:    v_mov_b32_e32 v42, 0
-; GFX9-NEXT:    v_mov_b32_e32 v44, 0
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[41:42], off
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[43:44], off
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_mov_b32_e32 v43, 0
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
+; GFX9-NEXT:    v_writelane_b32 v44, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v44, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v44, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
@@ -5245,18 +5245,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v9, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dwordx4 v[43:44], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v[41:42], v[6:9], off
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[40:41], v[6:9], off
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v44, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -5269,26 +5269,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, 0
+; GFX10-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-NEXT:    v_mov_b32_e32 v43, 16
-; GFX10-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-NEXT:    v_mov_b32_e32 v44, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_mov_b32_e32 v43, 0
+; GFX10-NEXT:    v_writelane_b32 v44, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[41:42], off
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[43:44], off
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
+; GFX10-NEXT:    v_writelane_b32 v44, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_writelane_b32 v44, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
@@ -5368,18 +5368,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-NEXT:    v_or_b32_sdwa v8, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v7, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dwordx4 v[43:44], v[7:10], off
-; GFX10-NEXT:    global_store_dwordx4 v[41:42], v[3:6], off
+; GFX10-NEXT:    global_store_dwordx4 v[42:43], v[7:10], off
+; GFX10-NEXT:    global_store_dwordx4 v[40:41], v[3:6], off
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
+; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v44, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
@@ -5393,24 +5393,24 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
-; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v44, s33
-; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_dual_mov_b32 v42, 0 :: v_dual_mov_b32 v43, 16
-; GFX11-NEXT:    v_mov_b32_e32 v44, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:12
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33
+; GFX11-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-NEXT:    v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 16
+; GFX11-NEXT:    v_mov_b32_e32 v43, 0
+; GFX11-NEXT:    v_writelane_b32 v44, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
-; GFX11-NEXT:    global_load_b128 v[0:3], v[41:42], off
+; GFX11-NEXT:    global_load_b128 v[0:3], v[40:41], off
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
-; GFX11-NEXT:    global_load_b128 v[16:19], v[43:44], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    global_load_b128 v[16:19], v[42:43], off
+; GFX11-NEXT:    v_writelane_b32 v44, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v44, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
@@ -5519,18 +5519,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
 ; GFX11-NEXT:    v_or_b32_e32 v3, v0, v2
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v[43:44], v[7:10], off
-; GFX11-NEXT:    global_store_b128 v[41:42], v[3:6], off
+; GFX11-NEXT:    global_store_b128 v[42:43], v[7:10], off
+; GFX11-NEXT:    global_store_b128 v[40:41], v[3:6], off
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v44, off, s33
-; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:4
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
+; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -5543,26 +5543,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v44, s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v43, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v44, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:12 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v43, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v43, 16
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v44, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v43, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v44, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[41:42], off
-; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[16:19], v[43:44], off
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
+; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v44, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v44, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
 ; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
@@ -5642,18 +5642,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v8, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v7, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-SCRATCH-NEXT:    global_store_dwordx4 v[43:44], v[7:10], off
-; GFX10-SCRATCH-NEXT:    global_store_dwordx4 v[41:42], v[3:6], off
+; GFX10-SCRATCH-NEXT:    global_store_dwordx4 v[42:43], v[7:10], off
+; GFX10-SCRATCH-NEXT:    global_store_dwordx4 v[40:41], v[3:6], off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x3
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v44, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v43, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:12
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v43, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:8
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:12
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v44, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
@@ -8255,29 +8255,29 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-NEXT:    s_mov_b32 s35, external_i32_func_i32@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_i32_func_i32@abs32@lo
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_mov_b32_e32 v42, v1
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX9-NEXT:    v_mov_b32_e32 v41, v1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    global_store_dword v[41:42], v0, off
+; GFX9-NEXT:    global_store_dword v[40:41], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -8290,31 +8290,31 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    v_mov_b32_e32 v41, v0
+; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    s_mov_b32 s35, external_i32_func_i32@abs32@hi
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s34, external_i32_func_i32@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_mov_b32_e32 v42, v1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_mov_b32_e32 v41, v1
+; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    global_store_dword v[41:42], v0, off
+; GFX10-NEXT:    global_store_dword v[40:41], v0, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -8328,30 +8328,30 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33
-; GFX11-NEXT:    v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-NEXT:    v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX11-NEXT:    s_mov_b32 s1, external_i32_func_i32@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_i32_func_i32@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    global_store_b32 v[41:42], v0, off dlc
+; GFX11-NEXT:    global_store_b32 v[40:41], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -8364,31 +8364,31 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, v0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_i32_func_i32@abs32@hi
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_i32_func_i32@abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, v1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, v1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    global_store_dword v[41:42], v0, off
+; GFX10-SCRATCH-NEXT:    global_store_dword v[40:41], v0, off
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index ad55d49a1a96d2..a14e3d5673f829 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -322,30 +322,30 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_void@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_void@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def v31
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_mov_b32_e32 v41, v31
+; GFX9-NEXT:    v_mov_b32_e32 v40, v31
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_mov_b32_e32 v31, v41
+; GFX9-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v31
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -358,31 +358,31 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_void@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_void@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def v31
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_mov_b32_e32 v41, v31
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_mov_b32_e32 v40, v31
+; GFX10-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_mov_b32_e32 v31, v41
+; GFX10-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v31
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -396,31 +396,31 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_void@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_void@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def v31
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_mov_b32_e32 v41, v31
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_mov_b32_e32 v40, v31
+; GFX11-NEXT:    v_writelane_b32 v41, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v31, v41
+; GFX11-NEXT:    v_mov_b32_e32 v31, v40
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use v31
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v41, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -1184,16 +1184,16 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v41, s34, 3
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_void@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_void@abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_writelane_b32 v41, s31, 2
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def s40
 ; GFX9-NEXT:    ;;#ASMEND
@@ -1201,21 +1201,21 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; def v32
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    v_mov_b32_e32 v41, v32
+; GFX9-NEXT:    v_mov_b32_e32 v40, v32
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s4
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v41
+; GFX9-NEXT:    ; use v40
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v41, 2
+; GFX9-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v41, 3
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT:    s_mov_b32 s33, s34
@@ -1228,15 +1228,15 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v41, s34, 3
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_void@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_void@abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_writelane_b32 v41, s4, 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def s40
 ; GFX10-NEXT:    ;;#ASMEND
@@ -1244,23 +1244,23 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; def v32
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    v_mov_b32_e32 v41, v32
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    v_mov_b32_e32 v40, v32
+; GFX10-NEXT:    v_writelane_b32 v41, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v41, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use s4
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    ;;#ASMSTART
-; GFX10-NEXT:    ; use v41
+; GFX10-NEXT:    ; use v40
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    v_readlane_b32 s31, v41, 2
+; GFX10-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v41, 3
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
@@ -1274,14 +1274,14 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 3
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_void@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_void@abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    v_writelane_b32 v41, s4, 0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def s40
 ; GFX11-NEXT:    ;;#ASMEND
@@ -1289,23 +1289,23 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; def v32
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_mov_b32_e32 v41, v32
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX11-NEXT:    v_mov_b32_e32 v40, v32
+; GFX11-NEXT:    v_writelane_b32 v41, s30, 1
+; GFX11-NEXT:    v_writelane_b32 v41, s31, 2
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s4
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    ;;#ASMSTART
-; GFX11-NEXT:    ; use v41
+; GFX11-NEXT:    ; use v40
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    v_readlane_b32 s31, v41, 2
+; GFX11-NEXT:    v_readlane_b32 s30, v41, 1
+; GFX11-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 3
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    s_mov_b32 s33, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 23502d1b36d182..c1d682689903ad 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -2764,27 +2764,26 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x7fc0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xffff8000
-; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0x28000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
@@ -2827,7 +2826,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:160
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GFX9-NEXT:    v_writelane_b32 v33, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v63, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, return_72xi32@abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, return_72xi32@abs32@lo
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x200, v0
@@ -2862,41 +2861,41 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v31, 0
-; GFX9-NEXT:    v_writelane_b32 v33, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v63, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:636
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:640
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:644
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:648
-; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:652
-; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s33 offset:656
-; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s33 offset:660
-; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s33 offset:664
-; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:668
-; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s33 offset:672
-; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s33 offset:676
-; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s33 offset:680
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s33 offset:684
-; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s33 offset:688
-; GFX9-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:692
-; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:696
-; GFX9-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:700
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:704
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:708
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:712
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:716
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:720
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:724
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:728
-; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:732
-; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:736
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:740
-; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:748
-; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:752
-; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:756
-; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:760
-; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:764
-; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:768
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:648
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:652
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:656
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s33 offset:660
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s33 offset:664
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s33 offset:668
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:672
+; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s33 offset:676
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s33 offset:680
+; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s33 offset:684
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s33 offset:688
+; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s33 offset:692
+; GFX9-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:696
+; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:700
+; GFX9-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:704
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:708
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:712
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:716
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:720
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:724
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:728
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:732
+; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:736
+; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:740
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:748
+; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:752
+; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:756
+; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:760
+; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:764
+; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:768
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:772
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:776
 ; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:780
@@ -2953,38 +2952,38 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:72
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:88
-; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:96
-; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:104
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 24
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:112
-; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:120
-; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:132
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:136
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:140
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:144
@@ -3003,26 +3002,25 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x400, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v33, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v33, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
+; GFX9-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_add_i32 s32, s32, 0xfffd8000
 ; GFX9-NEXT:    s_mov_b32 s33, s36
@@ -3036,27 +3034,27 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0x3fe0
 ; GFX10-NEXT:    s_and_b32 s33, s33, 0xffffc000
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0x14000
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_writelane_b32 v63, s30, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
@@ -3133,7 +3131,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    v_mov_b32_e32 v31, 0
 ; GFX10-NEXT:    s_mov_b32 s35, return_72xi32@abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, return_72xi32@abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_writelane_b32 v63, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_clause 0x28
 ; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:636
@@ -3154,21 +3152,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:696
 ; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:700
 ; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:704
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:708
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:712
-; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:716
-; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:720
-; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:724
-; GFX10-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:728
-; GFX10-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:732
-; GFX10-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:736
-; GFX10-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:740
-; GFX10-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:748
-; GFX10-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:752
-; GFX10-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:756
-; GFX10-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:760
-; GFX10-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:764
-; GFX10-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:768
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:708
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:712
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:716
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:720
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:724
+; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:728
+; GFX10-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:732
+; GFX10-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:736
+; GFX10-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:740
+; GFX10-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:748
+; GFX10-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:752
+; GFX10-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:756
+; GFX10-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:760
+; GFX10-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:764
+; GFX10-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:768
 ; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:772
 ; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:776
 ; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:780
@@ -3243,21 +3241,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:60
 ; GFX10-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:64
 ; GFX10-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:72
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76
-; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80
-; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84
-; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:88
-; GFX10-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:92
-; GFX10-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:96
-; GFX10-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:104
-; GFX10-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:112
-; GFX10-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:116
-; GFX10-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:120
-; GFX10-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:124
-; GFX10-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:128
-; GFX10-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:132
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:132
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:136
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:140
 ; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:144
@@ -3279,25 +3277,25 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x400, v0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    s_clause 0xe
-; GFX10-NEXT:    buffer_load_dword v63, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8
-; GFX10-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16
-; GFX10-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20
-; GFX10-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24
-; GFX10-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28
-; GFX10-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32
-; GFX10-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36
-; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40
-; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44
-; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    buffer_load_dword v62, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12
+; GFX10-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16
+; GFX10-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20
+; GFX10-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24
+; GFX10-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28
+; GFX10-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32
+; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56
+; GFX10-NEXT:    v_readlane_b32 s31, v63, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v63, 0
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
 ; GFX10-NEXT:    s_add_i32 s32, s32, 0xfffec000
@@ -3313,7 +3311,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v60, s33 offset:1536 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
@@ -3323,22 +3321,19 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_addk_i32 s32, 0xa00
-; GFX11-NEXT:    s_clause 0xe
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:56
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:52
-; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:48
-; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v46, s33 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v47, s33 offset:32
-; GFX11-NEXT:    scratch_store_b32 off, v56, s33 offset:28
-; GFX11-NEXT:    scratch_store_b32 off, v57, s33 offset:24
-; GFX11-NEXT:    scratch_store_b32 off, v58, s33 offset:20
-; GFX11-NEXT:    scratch_store_b32 off, v59, s33 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v60, s33 offset:12
-; GFX11-NEXT:    scratch_store_b32 off, v61, s33 offset:8
-; GFX11-NEXT:    scratch_store_b32 off, v62, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v63, s33
+; GFX11-NEXT:    s_clause 0xb
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:44
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:40
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:36
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:32
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:28
+; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:24
+; GFX11-NEXT:    scratch_store_b32 off, v46, s33 offset:20
+; GFX11-NEXT:    scratch_store_b32 off, v47, s33 offset:16
+; GFX11-NEXT:    scratch_store_b32 off, v56, s33 offset:12
+; GFX11-NEXT:    scratch_store_b32 off, v57, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v58, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v59, s33
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x90
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
@@ -3361,7 +3356,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
 ; GFX11-NEXT:    s_add_i32 s0, s33, 0x200
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v60, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
@@ -3380,7 +3375,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
 ; GFX11-NEXT:    s_mov_b32 s45, return_72xi32@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s44, return_72xi32@abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v60, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[44:45]
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b128 v[45:48], off, s33 offset:624
@@ -3391,22 +3386,16 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_clause 0x9
 ; GFX11-NEXT:    scratch_load_b128 v[48:51], off, s33 offset:656
 ; GFX11-NEXT:    scratch_load_b128 v[52:55], off, s33 offset:672
-; GFX11-NEXT:    scratch_load_b128 v[41:44], off, s33 offset:688
-; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:704
-; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:720
+; GFX11-NEXT:    scratch_load_b128 v[37:40], off, s33 offset:688
+; GFX11-NEXT:    scratch_load_b128 v[41:44], off, s33 offset:704
+; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:720
 ; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:736
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:752
 ; GFX11-NEXT:    scratch_load_b128 v[4:7], off, s33 offset:768
 ; GFX11-NEXT:    scratch_load_b128 v[8:11], off, s33 offset:784
 ; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:512
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill
 ; GFX11-NEXT:    s_clause 0x3
@@ -3414,13 +3403,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
 ; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:560
 ; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:576
-; GFX11-NEXT:    v_mov_b32_e32 v56, v63
-; GFX11-NEXT:    v_mov_b32_e32 v12, v15
-; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v15, v2
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v19
+; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_mov_b32_e32 v10, v21
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3432,47 +3415,53 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill
 ; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s32
-; GFX11-NEXT:    v_mov_b32_e32 v32, v36
+; GFX11-NEXT:    v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v32, v36
 ; GFX11-NEXT:    v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49
-; GFX11-NEXT:    v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51
-; GFX11-NEXT:    v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41
-; GFX11-NEXT:    v_mov_b32_e32 v50, v42
-; GFX11-NEXT:    v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v13
-; GFX11-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3
-; GFX11-NEXT:    v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9
+; GFX11-NEXT:    v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v48, v51
+; GFX11-NEXT:    v_dual_mov_b32 v49, v52 :: v_dual_mov_b32 v50, v53
+; GFX11-NEXT:    v_dual_mov_b32 v51, v54 :: v_dual_mov_b32 v36, v55
+; GFX11-NEXT:    v_dual_mov_b32 v53, v41 :: v_dual_mov_b32 v52, v40
+; GFX11-NEXT:    v_dual_mov_b32 v54, v42 :: v_dual_mov_b32 v41, v56
+; GFX11-NEXT:    v_dual_mov_b32 v55, v43 :: v_dual_mov_b32 v40, v44
+; GFX11-NEXT:    v_dual_mov_b32 v42, v57 :: v_dual_mov_b32 v57, v12
+; GFX11-NEXT:    v_dual_mov_b32 v43, v58 :: v_dual_mov_b32 v56, v59
+; GFX11-NEXT:    v_mov_b32_e32 v58, v13
+; GFX11-NEXT:    v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v13, v0
+; GFX11-NEXT:    v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3
+; GFX11-NEXT:    v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6
+; GFX11-NEXT:    v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9
+; GFX11-NEXT:    v_mov_b32_e32 v9, v20
 ; GFX11-NEXT:    scratch_store_b32 off, v11, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x90
-; GFX11-NEXT:    v_mov_b32_e32 v51, v43
-; GFX11-NEXT:    v_mov_b32_e32 v41, v59
+; GFX11-NEXT:    v_mov_b32_e32 v11, v22
 ; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
-; GFX11-NEXT:    v_mov_b32_e32 v7, v18
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x80
-; GFX11-NEXT:    v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61
+; GFX11-NEXT:    v_mov_b32_e32 v5, v16
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20
+; GFX11-NEXT:    v_mov_b32_e32 v0, 24
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x70
-; GFX11-NEXT:    v_mov_b32_e32 v5, v16
+; GFX11-NEXT:    v_mov_b32_e32 v6, v17
 ; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v24
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x6c
-; GFX11-NEXT:    v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v11, v22
+; GFX11-NEXT:    v_mov_b32_e32 v7, v18
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x60
-; GFX11-NEXT:    v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
+; GFX11-NEXT:    v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26
 ; GFX11-NEXT:    scratch_store_b96 off, v[56:58], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x50
-; GFX11-NEXT:    v_mov_b32_e32 v13, v24
-; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s0
+; GFX11-NEXT:    v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
+; GFX11-NEXT:    scratch_store_b128 off, v[40:43], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47
+; GFX11-NEXT:    v_mov_b32_e32 v14, v25
 ; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 48
-; GFX11-NEXT:    v_mov_b32_e32 v15, v26
-; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 32
 ; GFX11-NEXT:    v_mov_b32_e32 v16, v27
 ; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    s_add_i32 s0, s32, 32
 ; GFX11-NEXT:    v_mov_b32_e32 v30, v46
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 16
 ; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s0
 ; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3485,26 +3474,23 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[44:45]
-; GFX11-NEXT:    s_clause 0xe
-; GFX11-NEXT:    scratch_load_b32 v63, off, s33
-; GFX11-NEXT:    scratch_load_b32 v62, off, s33 offset:4
-; GFX11-NEXT:    scratch_load_b32 v61, off, s33 offset:8
-; GFX11-NEXT:    scratch_load_b32 v60, off, s33 offset:12
-; GFX11-NEXT:    scratch_load_b32 v59, off, s33 offset:16
-; GFX11-NEXT:    scratch_load_b32 v58, off, s33 offset:20
-; GFX11-NEXT:    scratch_load_b32 v57, off, s33 offset:24
-; GFX11-NEXT:    scratch_load_b32 v56, off, s33 offset:28
-; GFX11-NEXT:    scratch_load_b32 v47, off, s33 offset:32
-; GFX11-NEXT:    scratch_load_b32 v46, off, s33 offset:36
-; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:40
-; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:44
-; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:48
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:52
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:56
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_clause 0xb
+; GFX11-NEXT:    scratch_load_b32 v59, off, s33
+; GFX11-NEXT:    scratch_load_b32 v58, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v57, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v56, off, s33 offset:12
+; GFX11-NEXT:    scratch_load_b32 v47, off, s33 offset:16
+; GFX11-NEXT:    scratch_load_b32 v46, off, s33 offset:20
+; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:24
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:28
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:32
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:36
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:40
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:44
+; GFX11-NEXT:    v_readlane_b32 s31, v60, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v60, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xf600
 ; GFX11-NEXT:    s_mov_b32 s33, s46

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 65404637ca51bf..76ec1cc84f55b2 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -152,23 +152,23 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_swap v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -202,23 +202,23 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_swap v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -830,23 +830,23 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_swap v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -958,23 +958,23 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_swap v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1416,23 +1416,23 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_add v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_add v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1466,23 +1466,23 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1766,23 +1766,23 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_sub v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1816,23 +1816,23 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_sub v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2116,23 +2116,23 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_and v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_and v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2166,23 +2166,23 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_and v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_and v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2649,38 +2649,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, s34, v2
-; SI-NEXT:    v_not_b32_e32 v1, v1
+; SI-NEXT:    v_and_b32_e32 v0, s34, v1
+; SI-NEXT:    v_not_b32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB44_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2740,38 +2740,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, s34, v2
-; SI-NEXT:    v_not_b32_e32 v1, v1
+; SI-NEXT:    v_and_b32_e32 v0, s34, v1
+; SI-NEXT:    v_not_b32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB45_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2834,39 +2834,38 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
-; SI-NEXT:    v_and_b32_e32 v0, s34, v4
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_not_b32_e32 v3, v0
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_and_b32_e32 v0, s34, v2
+; SI-NEXT:    v_not_b32_e32 v1, v0
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB46_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2928,39 +2927,38 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
-; SI-NEXT:    v_and_b32_e32 v0, s34, v4
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_not_b32_e32 v3, v0
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_and_b32_e32 v0, s34, v2
+; SI-NEXT:    v_not_b32_e32 v1, v0
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB47_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3167,23 +3165,23 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_or v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_or v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3217,23 +3215,23 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_or v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_or v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3517,23 +3515,23 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_xor v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_xor v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3567,23 +3565,23 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_xor v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_xor v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4038,37 +4036,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB68_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_max_i32_e32 v1, s34, v2
+; SI-NEXT:    v_max_i32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB68_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4126,37 +4124,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB69_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_max_i32_e32 v1, s34, v2
+; SI-NEXT:    v_max_i32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB69_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4217,38 +4215,37 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB70_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v3, s34, v4
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_max_i32_e32 v1, s34, v2
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB70_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4308,38 +4305,37 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB71_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v3, s34, v4
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_max_i32_e32 v1, s34, v2
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB71_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5135,37 +5131,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB80_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_max_u32_e32 v1, s34, v2
+; SI-NEXT:    v_max_u32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB80_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5223,37 +5219,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB81_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_max_u32_e32 v1, s34, v2
+; SI-NEXT:    v_max_u32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB81_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5314,38 +5310,37 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB82_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v3, s34, v4
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_max_u32_e32 v1, s34, v2
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB82_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5405,38 +5400,37 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB83_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v3, s34, v4
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_max_u32_e32 v1, s34, v2
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB83_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6137,37 +6131,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_min_u32_e32 v1, s34, v2
+; SI-NEXT:    v_min_u32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB91_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6225,37 +6219,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB92_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_min_u32_e32 v1, s34, v2
+; SI-NEXT:    v_min_u32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB92_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6316,38 +6310,37 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_u32_e32 v3, s34, v4
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_min_u32_e32 v1, s34, v2
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB93_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6407,38 +6400,37 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB94_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_u32_e32 v3, s34, v4
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_min_u32_e32 v1, s34, v2
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB94_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6814,37 +6806,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB99_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_min_i32_e32 v1, s34, v2
+; SI-NEXT:    v_min_i32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB99_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6902,37 +6894,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB100_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_min_i32_e32 v1, s34, v2
+; SI-NEXT:    v_min_i32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB100_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6993,38 +6985,37 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB101_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v3, s34, v4
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_min_i32_e32 v1, s34, v2
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB101_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7084,38 +7075,37 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v1, s6, 0
-; SI-NEXT:    v_writelane_b32 v1, s7, 1
+; SI-NEXT:    v_writelane_b32 v3, s6, 0
+; SI-NEXT:    v_writelane_b32 v3, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v3, s34, v4
-; SI-NEXT:    v_mov_b32_e32 v2, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_min_i32_e32 v1, s34, v2
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB102_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_readlane_b32 s7, v1, 1
-; SI-NEXT:    v_readlane_b32 s6, v1, 0
+; SI-NEXT:    v_readlane_b32 s7, v3, 1
+; SI-NEXT:    v_readlane_b32 s6, v3, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7727,23 +7717,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_inc v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_inc v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7777,23 +7767,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_inc v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_inc v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -8077,23 +8067,23 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_dec v1, off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_dec v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -8127,23 +8117,23 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v1, s6, 0
+; SI-NEXT:    v_writelane_b32 v1, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_dec v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_atomic_dec v0, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v1, 1
+; SI-NEXT:    v_readlane_b32 s6, v1, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 34457781a99992..d137f471910dc6 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -154,25 +154,25 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -208,23 +208,23 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -846,25 +846,25 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -978,23 +978,23 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1446,25 +1446,25 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1500,23 +1500,23 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1810,25 +1810,25 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1864,23 +1864,23 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2174,25 +2174,25 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_and_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2228,23 +2228,23 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2772,44 +2772,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v8, s6, 0
+; SI-NEXT:    v_writelane_b32 v8, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, s34, v4
+; SI-NEXT:    v_and_b32_e32 v0, s34, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, s35, v3
-; SI-NEXT:    v_not_b32_e32 v2, v1
-; SI-NEXT:    v_not_b32_e32 v1, v5
-; SI-NEXT:    v_mov_b32_e32 v8, v4
+; SI-NEXT:    v_and_b32_e32 v4, s35, v2
+; SI-NEXT:    v_not_b32_e32 v1, v0
+; SI-NEXT:    v_not_b32_e32 v0, v4
 ; SI-NEXT:    v_mov_b32_e32 v7, v3
 ; SI-NEXT:    v_mov_b32_e32 v6, v2
 ; SI-NEXT:    v_mov_b32_e32 v5, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB44_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v8, 1
+; SI-NEXT:    v_readlane_b32 s6, v8, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2877,44 +2877,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v8, s6, 0
+; SI-NEXT:    v_writelane_b32 v8, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, s34, v4
+; SI-NEXT:    v_and_b32_e32 v0, s34, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, s35, v3
-; SI-NEXT:    v_not_b32_e32 v2, v1
-; SI-NEXT:    v_not_b32_e32 v1, v5
-; SI-NEXT:    v_mov_b32_e32 v8, v4
+; SI-NEXT:    v_and_b32_e32 v4, s35, v2
+; SI-NEXT:    v_not_b32_e32 v1, v0
+; SI-NEXT:    v_not_b32_e32 v0, v4
 ; SI-NEXT:    v_mov_b32_e32 v7, v3
 ; SI-NEXT:    v_mov_b32_e32 v6, v2
 ; SI-NEXT:    v_mov_b32_e32 v5, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB45_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v8, 1
+; SI-NEXT:    v_readlane_b32 s6, v8, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2983,46 +2983,44 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v6, s6, 0
+; SI-NEXT:    v_writelane_b32 v6, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_and_b32_e32 v0, s34, v8
-; SI-NEXT:    v_and_b32_e32 v1, s35, v7
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_not_b32_e32 v6, v0
-; SI-NEXT:    v_not_b32_e32 v5, v1
+; SI-NEXT:    v_and_b32_e32 v0, s34, v5
+; SI-NEXT:    v_and_b32_e32 v1, s35, v4
+; SI-NEXT:    v_not_b32_e32 v3, v0
+; SI-NEXT:    v_not_b32_e32 v2, v1
+; SI-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NEXT:    v_mov_b32_e32 v1, v3
+; SI-NEXT:    v_mov_b32_e32 v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB46_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v6, 1
+; SI-NEXT:    v_readlane_b32 s6, v6, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3090,46 +3088,44 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v6, s6, 0
+; SI-NEXT:    v_writelane_b32 v6, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_and_b32_e32 v0, s34, v8
-; SI-NEXT:    v_and_b32_e32 v1, s35, v7
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_not_b32_e32 v6, v0
-; SI-NEXT:    v_not_b32_e32 v5, v1
+; SI-NEXT:    v_and_b32_e32 v0, s34, v5
+; SI-NEXT:    v_and_b32_e32 v1, s35, v4
+; SI-NEXT:    v_not_b32_e32 v3, v0
+; SI-NEXT:    v_not_b32_e32 v2, v1
+; SI-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NEXT:    v_mov_b32_e32 v1, v3
+; SI-NEXT:    v_mov_b32_e32 v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB47_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v6, 1
+; SI-NEXT:    v_readlane_b32 s6, v6, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3344,25 +3340,25 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_or_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_or_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3398,23 +3394,23 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3708,25 +3704,25 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3762,23 +3758,23 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4294,45 +4290,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v5, s35
-; SI-NEXT:    v_mov_b32_e32 v6, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB68_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[3:4]
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v4
 ; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
 ; SI-NEXT:    v_mov_b32_e32 v3, v7
-; SI-NEXT:    v_mov_b32_e32 v4, v8
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB68_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4402,45 +4398,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v5, s35
-; SI-NEXT:    v_mov_b32_e32 v6, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB69_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[3:4]
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v4
 ; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
 ; SI-NEXT:    v_mov_b32_e32 v3, v7
-; SI-NEXT:    v_mov_b32_e32 v4, v8
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB69_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4511,47 +4507,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s35
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB70_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[7:8]
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v6, v0, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v7, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NEXT:    v_mov_b32_e32 v1, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v3, v9
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB70_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4621,47 +4615,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s35
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB71_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[7:8]
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v6, v0, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v7, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NEXT:    v_mov_b32_e32 v1, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v3, v9
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB71_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5576,45 +5568,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v5, s35
-; SI-NEXT:    v_mov_b32_e32 v6, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB80_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[3:4]
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v4
 ; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
 ; SI-NEXT:    v_mov_b32_e32 v3, v7
-; SI-NEXT:    v_mov_b32_e32 v4, v8
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB80_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5684,45 +5676,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v5, s35
-; SI-NEXT:    v_mov_b32_e32 v6, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB81_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[3:4]
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v4
 ; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
 ; SI-NEXT:    v_mov_b32_e32 v3, v7
-; SI-NEXT:    v_mov_b32_e32 v4, v8
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB81_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5793,47 +5785,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s35
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB82_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[7:8]
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v6, v0, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v7, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NEXT:    v_mov_b32_e32 v1, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v3, v9
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB82_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5903,47 +5893,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s35
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB83_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[7:8]
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v6, v0, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v7, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NEXT:    v_mov_b32_e32 v1, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v3, v9
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB83_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6746,45 +6734,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v5, s35
-; SI-NEXT:    v_mov_b32_e32 v6, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[3:4]
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v4
 ; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
 ; SI-NEXT:    v_mov_b32_e32 v3, v7
-; SI-NEXT:    v_mov_b32_e32 v4, v8
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB91_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6854,45 +6842,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v5, s35
-; SI-NEXT:    v_mov_b32_e32 v6, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB92_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[3:4]
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v4
 ; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
 ; SI-NEXT:    v_mov_b32_e32 v3, v7
-; SI-NEXT:    v_mov_b32_e32 v4, v8
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB92_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6963,47 +6951,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s35
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[7:8]
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v6, v0, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v7, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NEXT:    v_mov_b32_e32 v1, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v3, v9
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB93_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7073,47 +7059,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s35
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB94_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[7:8]
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v6, v0, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v7, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NEXT:    v_mov_b32_e32 v1, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v3, v9
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB94_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7552,45 +7536,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v5, s35
-; SI-NEXT:    v_mov_b32_e32 v6, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB99_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[3:4]
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v4
 ; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
 ; SI-NEXT:    v_mov_b32_e32 v3, v7
-; SI-NEXT:    v_mov_b32_e32 v4, v8
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB99_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7660,45 +7644,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v5, s35
-; SI-NEXT:    v_mov_b32_e32 v6, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB100_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[3:4]
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v4
 ; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
 ; SI-NEXT:    v_mov_b32_e32 v3, v7
-; SI-NEXT:    v_mov_b32_e32 v4, v8
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB100_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7769,47 +7753,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s35
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB101_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[7:8]
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v6, v0, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v7, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NEXT:    v_mov_b32_e32 v1, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v3, v9
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB101_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7879,47 +7861,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v2, s6, 0
-; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_writelane_b32 v10, s6, 0
+; SI-NEXT:    v_writelane_b32 v10, s7, 1
 ; SI-NEXT:    s_mov_b32 s35, s7
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s35
-; SI-NEXT:    v_mov_b32_e32 v1, s34
+; SI-NEXT:    v_mov_b32_e32 v4, s35
+; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[7:8]
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v6, v0, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v7, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NEXT:    v_mov_b32_e32 v1, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v3, v9
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB102_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    v_mov_b32_e32 v1, v4
-; SI-NEXT:    v_readlane_b32 s7, v2, 1
-; SI-NEXT:    v_readlane_b32 s6, v2, 0
+; SI-NEXT:    v_readlane_b32 s7, v10, 1
+; SI-NEXT:    v_readlane_b32 s6, v10, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -8601,25 +8581,25 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -8655,23 +8635,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -8965,25 +8945,25 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s35
-; SI-NEXT:    v_mov_b32_e32 v2, s34
+; SI-NEXT:    v_mov_b32_e32 v0, s35
+; SI-NEXT:    v_mov_b32_e32 v1, s34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT:    buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -9019,23 +8999,23 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v0, s6, 0
-; SI-NEXT:    v_writelane_b32 v0, s7, 1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_writelane_b32 v2, s6, 0
+; SI-NEXT:    v_writelane_b32 v2, s7, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_readlane_b32 s7, v0, 1
-; SI-NEXT:    v_readlane_b32 s6, v0, 0
+; SI-NEXT:    v_readlane_b32 s7, v2, 1
+; SI-NEXT:    v_readlane_b32 s6, v2, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index e456b7d2e8b9b0..297b5180dfe9bd 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -6,209 +6,209 @@ define void @main(i1 %arg) #0 {
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v1, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v1, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v1, s36, 2
-; CHECK-NEXT:    v_writelane_b32 v1, s37, 3
-; CHECK-NEXT:    v_writelane_b32 v1, s38, 4
-; CHECK-NEXT:    v_writelane_b32 v1, s39, 5
-; CHECK-NEXT:    v_writelane_b32 v1, s40, 6
-; CHECK-NEXT:    v_writelane_b32 v1, s41, 7
-; CHECK-NEXT:    v_writelane_b32 v1, s42, 8
-; CHECK-NEXT:    v_writelane_b32 v1, s43, 9
-; CHECK-NEXT:    v_writelane_b32 v1, s44, 10
-; CHECK-NEXT:    v_writelane_b32 v1, s45, 11
-; CHECK-NEXT:    v_writelane_b32 v1, s46, 12
-; CHECK-NEXT:    v_writelane_b32 v1, s47, 13
-; CHECK-NEXT:    v_writelane_b32 v1, s48, 14
-; CHECK-NEXT:    v_writelane_b32 v1, s49, 15
+; CHECK-NEXT:    v_writelane_b32 v8, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v8, s31, 1
+; CHECK-NEXT:    v_writelane_b32 v8, s36, 2
+; CHECK-NEXT:    v_writelane_b32 v8, s37, 3
+; CHECK-NEXT:    v_writelane_b32 v8, s38, 4
+; CHECK-NEXT:    v_writelane_b32 v8, s39, 5
+; CHECK-NEXT:    v_writelane_b32 v8, s40, 6
+; CHECK-NEXT:    v_writelane_b32 v8, s41, 7
+; CHECK-NEXT:    v_writelane_b32 v8, s42, 8
+; CHECK-NEXT:    v_writelane_b32 v8, s43, 9
+; CHECK-NEXT:    v_writelane_b32 v8, s44, 10
+; CHECK-NEXT:    v_writelane_b32 v8, s45, 11
+; CHECK-NEXT:    v_writelane_b32 v8, s46, 12
+; CHECK-NEXT:    v_writelane_b32 v8, s47, 13
+; CHECK-NEXT:    v_writelane_b32 v8, s48, 14
+; CHECK-NEXT:    v_writelane_b32 v8, s49, 15
 ; CHECK-NEXT:    s_getpc_b64 s[24:25]
-; CHECK-NEXT:    v_writelane_b32 v1, s50, 16
+; CHECK-NEXT:    v_writelane_b32 v8, s50, 16
 ; CHECK-NEXT:    s_movk_i32 s4, 0xf0
 ; CHECK-NEXT:    s_mov_b32 s5, s24
-; CHECK-NEXT:    v_writelane_b32 v1, s51, 17
+; CHECK-NEXT:    v_writelane_b32 v8, s51, 17
 ; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
-; CHECK-NEXT:    ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
+; CHECK-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
 ; CHECK-NEXT:    s_movk_i32 s4, 0x130
 ; CHECK-NEXT:    s_mov_b32 s5, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v5, s36, 0
-; CHECK-NEXT:    v_writelane_b32 v5, s37, 1
-; CHECK-NEXT:    v_writelane_b32 v5, s38, 2
-; CHECK-NEXT:    v_writelane_b32 v5, s39, 3
-; CHECK-NEXT:    v_writelane_b32 v5, s40, 4
-; CHECK-NEXT:    v_writelane_b32 v5, s41, 5
-; CHECK-NEXT:    v_writelane_b32 v5, s42, 6
-; CHECK-NEXT:    v_writelane_b32 v5, s43, 7
-; CHECK-NEXT:    v_writelane_b32 v5, s44, 8
-; CHECK-NEXT:    v_writelane_b32 v5, s45, 9
-; CHECK-NEXT:    v_writelane_b32 v5, s46, 10
+; CHECK-NEXT:    v_writelane_b32 v4, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v4, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v4, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v4, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v4, s40, 4
+; CHECK-NEXT:    v_writelane_b32 v4, s41, 5
+; CHECK-NEXT:    v_writelane_b32 v4, s42, 6
+; CHECK-NEXT:    v_writelane_b32 v4, s43, 7
+; CHECK-NEXT:    v_writelane_b32 v4, s44, 8
+; CHECK-NEXT:    v_writelane_b32 v4, s45, 9
+; CHECK-NEXT:    v_writelane_b32 v4, s46, 10
 ; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v5, s47, 11
-; CHECK-NEXT:    v_writelane_b32 v5, s48, 12
-; CHECK-NEXT:    v_writelane_b32 v5, s49, 13
+; CHECK-NEXT:    v_writelane_b32 v4, s47, 11
+; CHECK-NEXT:    v_writelane_b32 v4, s48, 12
+; CHECK-NEXT:    v_writelane_b32 v4, s49, 13
 ; CHECK-NEXT:    s_mov_b32 s20, 0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_writelane_b32 v5, s50, 14
-; CHECK-NEXT:    v_mov_b32_e32 v6, s28
-; CHECK-NEXT:    v_mov_b32_e32 v7, v2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_writelane_b32 v4, s50, 14
+; CHECK-NEXT:    v_mov_b32_e32 v5, s28
+; CHECK-NEXT:    v_mov_b32_e32 v6, v1
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
 ; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_writelane_b32 v5, s51, 15
-; CHECK-NEXT:    v_mov_b32_e32 v3, v2
-; CHECK-NEXT:    image_sample_lz v6, v[6:7], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_writelane_b32 v4, s51, 15
+; CHECK-NEXT:    v_mov_b32_e32 v2, v1
+; CHECK-NEXT:    image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v5, s4, 16
-; CHECK-NEXT:    v_writelane_b32 v5, s5, 17
-; CHECK-NEXT:    v_writelane_b32 v5, s6, 18
-; CHECK-NEXT:    v_writelane_b32 v5, s7, 19
-; CHECK-NEXT:    v_writelane_b32 v5, s8, 20
-; CHECK-NEXT:    v_writelane_b32 v5, s9, 21
-; CHECK-NEXT:    image_sample_lz v7, v[2:3], s[4:11], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_writelane_b32 v5, s10, 22
-; CHECK-NEXT:    v_writelane_b32 v5, s11, 23
-; CHECK-NEXT:    v_writelane_b32 v5, s12, 24
-; CHECK-NEXT:    v_writelane_b32 v5, s13, 25
-; CHECK-NEXT:    v_writelane_b32 v5, s14, 26
-; CHECK-NEXT:    v_writelane_b32 v5, s15, 27
-; CHECK-NEXT:    v_writelane_b32 v5, s16, 28
-; CHECK-NEXT:    v_writelane_b32 v1, s52, 18
-; CHECK-NEXT:    v_writelane_b32 v5, s17, 29
-; CHECK-NEXT:    v_writelane_b32 v1, s53, 19
-; CHECK-NEXT:    v_writelane_b32 v5, s18, 30
-; CHECK-NEXT:    v_writelane_b32 v1, s54, 20
-; CHECK-NEXT:    v_writelane_b32 v5, s19, 31
+; CHECK-NEXT:    v_writelane_b32 v4, s4, 16
+; CHECK-NEXT:    v_writelane_b32 v4, s5, 17
+; CHECK-NEXT:    v_writelane_b32 v4, s6, 18
+; CHECK-NEXT:    v_writelane_b32 v4, s7, 19
+; CHECK-NEXT:    v_writelane_b32 v4, s8, 20
+; CHECK-NEXT:    v_writelane_b32 v4, s9, 21
+; CHECK-NEXT:    image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_writelane_b32 v4, s10, 22
+; CHECK-NEXT:    v_writelane_b32 v4, s11, 23
+; CHECK-NEXT:    v_writelane_b32 v4, s12, 24
+; CHECK-NEXT:    v_writelane_b32 v4, s13, 25
+; CHECK-NEXT:    v_writelane_b32 v4, s14, 26
+; CHECK-NEXT:    v_writelane_b32 v4, s15, 27
+; CHECK-NEXT:    v_writelane_b32 v4, s16, 28
+; CHECK-NEXT:    v_writelane_b32 v8, s52, 18
+; CHECK-NEXT:    v_writelane_b32 v4, s17, 29
+; CHECK-NEXT:    v_writelane_b32 v8, s53, 19
+; CHECK-NEXT:    v_writelane_b32 v4, s18, 30
+; CHECK-NEXT:    v_writelane_b32 v8, s54, 20
+; CHECK-NEXT:    v_writelane_b32 v4, s19, 31
 ; CHECK-NEXT:    s_mov_b32 s4, 48
 ; CHECK-NEXT:    s_mov_b32 s5, s24
-; CHECK-NEXT:    v_writelane_b32 v1, s55, 21
+; CHECK-NEXT:    v_writelane_b32 v8, s55, 21
 ; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v1, s56, 22
-; CHECK-NEXT:    v_writelane_b32 v1, s57, 23
-; CHECK-NEXT:    v_writelane_b32 v1, s58, 24
-; CHECK-NEXT:    v_writelane_b32 v1, s59, 25
-; CHECK-NEXT:    v_writelane_b32 v1, s60, 26
+; CHECK-NEXT:    v_writelane_b32 v8, s56, 22
+; CHECK-NEXT:    v_writelane_b32 v8, s57, 23
+; CHECK-NEXT:    v_writelane_b32 v8, s58, 24
+; CHECK-NEXT:    v_writelane_b32 v8, s59, 25
+; CHECK-NEXT:    v_writelane_b32 v8, s60, 26
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v5, s4, 32
-; CHECK-NEXT:    v_writelane_b32 v1, s61, 27
-; CHECK-NEXT:    v_writelane_b32 v5, s5, 33
-; CHECK-NEXT:    v_writelane_b32 v1, s62, 28
-; CHECK-NEXT:    v_writelane_b32 v5, s6, 34
-; CHECK-NEXT:    v_writelane_b32 v1, s63, 29
-; CHECK-NEXT:    v_writelane_b32 v5, s7, 35
-; CHECK-NEXT:    v_writelane_b32 v1, s64, 30
-; CHECK-NEXT:    v_writelane_b32 v5, s8, 36
-; CHECK-NEXT:    v_writelane_b32 v1, s65, 31
-; CHECK-NEXT:    v_writelane_b32 v5, s9, 37
-; CHECK-NEXT:    v_writelane_b32 v1, s66, 32
+; CHECK-NEXT:    v_writelane_b32 v4, s4, 32
+; CHECK-NEXT:    v_writelane_b32 v8, s61, 27
+; CHECK-NEXT:    v_writelane_b32 v4, s5, 33
+; CHECK-NEXT:    v_writelane_b32 v8, s62, 28
+; CHECK-NEXT:    v_writelane_b32 v4, s6, 34
+; CHECK-NEXT:    v_writelane_b32 v8, s63, 29
+; CHECK-NEXT:    v_writelane_b32 v4, s7, 35
+; CHECK-NEXT:    v_writelane_b32 v8, s64, 30
+; CHECK-NEXT:    v_writelane_b32 v4, s8, 36
+; CHECK-NEXT:    v_writelane_b32 v8, s65, 31
+; CHECK-NEXT:    v_writelane_b32 v4, s9, 37
+; CHECK-NEXT:    v_writelane_b32 v8, s66, 32
 ; CHECK-NEXT:    s_movk_i32 s26, 0x1f0
 ; CHECK-NEXT:    s_movk_i32 s28, 0x2f0
 ; CHECK-NEXT:    s_mov_b32 s27, s24
 ; CHECK-NEXT:    s_mov_b32 s29, s24
-; CHECK-NEXT:    v_writelane_b32 v5, s10, 38
-; CHECK-NEXT:    v_writelane_b32 v1, s67, 33
-; CHECK-NEXT:    v_writelane_b32 v5, s11, 39
+; CHECK-NEXT:    v_writelane_b32 v4, s10, 38
+; CHECK-NEXT:    v_writelane_b32 v8, s67, 33
+; CHECK-NEXT:    v_writelane_b32 v4, s11, 39
 ; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[26:27], 0x0
 ; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[28:29], 0x0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
-; CHECK-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
+; CHECK-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mul_f32_e32 v0, v7, v6
+; CHECK-NEXT:    v_mul_f32_e32 v0, v6, v5
 ; CHECK-NEXT:    s_and_saveexec_b64 s[26:27], s[24:25]
 ; CHECK-NEXT:    s_xor_b64 s[26:27], exec, s[26:27]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_3
 ; CHECK-NEXT:  ; %bb.1: ; %bb48
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v5, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v5, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v5, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v5, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v5, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v5, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v5, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v5, 15
+; CHECK-NEXT:    v_readlane_b32 s36, v4, 0
+; CHECK-NEXT:    v_readlane_b32 s44, v4, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v4, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v4, 10
+; CHECK-NEXT:    v_readlane_b32 s47, v4, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v4, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v4, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v4, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v4, 15
 ; CHECK-NEXT:    s_and_b64 vcc, exec, -1
-; CHECK-NEXT:    v_readlane_b32 s37, v5, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v5, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v5, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v5, 4
-; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[44:51], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_readlane_b32 s41, v5, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v5, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v5, 7
+; CHECK-NEXT:    v_readlane_b32 s37, v4, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v4, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v4, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v4, 4
+; CHECK-NEXT:    image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_readlane_b32 s41, v4, 5
+; CHECK-NEXT:    v_readlane_b32 s42, v4, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v4, 7
 ; CHECK-NEXT:  .LBB0_2: ; %bb50
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 32
-; CHECK-NEXT:    v_readlane_b32 s40, v5, 36
-; CHECK-NEXT:    v_readlane_b32 s41, v5, 37
-; CHECK-NEXT:    v_readlane_b32 s42, v5, 38
-; CHECK-NEXT:    v_readlane_b32 s43, v5, 39
+; CHECK-NEXT:    v_readlane_b32 s36, v4, 32
+; CHECK-NEXT:    v_readlane_b32 s40, v4, 36
+; CHECK-NEXT:    v_readlane_b32 s41, v4, 37
+; CHECK-NEXT:    v_readlane_b32 s42, v4, 38
+; CHECK-NEXT:    v_readlane_b32 s43, v4, 39
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
 ; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_readlane_b32 s37, v5, 33
-; CHECK-NEXT:    v_readlane_b32 s38, v5, 34
+; CHECK-NEXT:    v_readlane_b32 s37, v4, 33
+; CHECK-NEXT:    v_readlane_b32 s38, v4, 34
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    image_sample_lz v7, v[2:3], s[60:67], s[40:43] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s39, v5, 35
-; CHECK-NEXT:    image_sample_lz v2, v[2:3], s[12:19], s[20:23] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s39, v4, 35
+; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_sub_f32_e32 v2, v2, v7
-; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v0
-; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v6
+; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v6
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v0
+; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: ; %Flow14
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s12, v5, 32
-; CHECK-NEXT:    v_readlane_b32 s13, v5, 33
-; CHECK-NEXT:    v_readlane_b32 s14, v5, 34
-; CHECK-NEXT:    v_readlane_b32 s15, v5, 35
-; CHECK-NEXT:    v_readlane_b32 s16, v5, 36
-; CHECK-NEXT:    v_readlane_b32 s17, v5, 37
-; CHECK-NEXT:    v_readlane_b32 s18, v5, 38
-; CHECK-NEXT:    v_readlane_b32 s19, v5, 39
-; CHECK-NEXT:    v_writelane_b32 v5, s4, 40
-; CHECK-NEXT:    v_writelane_b32 v5, s5, 41
-; CHECK-NEXT:    v_writelane_b32 v5, s6, 42
-; CHECK-NEXT:    v_writelane_b32 v5, s7, 43
-; CHECK-NEXT:    v_writelane_b32 v5, s8, 44
-; CHECK-NEXT:    v_writelane_b32 v5, s9, 45
-; CHECK-NEXT:    v_writelane_b32 v5, s10, 46
-; CHECK-NEXT:    v_writelane_b32 v5, s11, 47
-; CHECK-NEXT:    v_writelane_b32 v5, s12, 48
-; CHECK-NEXT:    v_writelane_b32 v5, s13, 49
-; CHECK-NEXT:    v_writelane_b32 v5, s14, 50
-; CHECK-NEXT:    v_writelane_b32 v5, s15, 51
-; CHECK-NEXT:    v_writelane_b32 v5, s16, 52
-; CHECK-NEXT:    v_writelane_b32 v5, s17, 53
-; CHECK-NEXT:    v_writelane_b32 v5, s18, 54
-; CHECK-NEXT:    v_writelane_b32 v5, s19, 55
-; CHECK-NEXT:    v_writelane_b32 v5, s52, 56
-; CHECK-NEXT:    v_writelane_b32 v4, s60, 0
-; CHECK-NEXT:    v_writelane_b32 v5, s53, 57
-; CHECK-NEXT:    v_writelane_b32 v4, s61, 1
-; CHECK-NEXT:    v_writelane_b32 v5, s54, 58
-; CHECK-NEXT:    v_writelane_b32 v4, s62, 2
-; CHECK-NEXT:    v_writelane_b32 v5, s55, 59
-; CHECK-NEXT:    v_writelane_b32 v4, s63, 3
-; CHECK-NEXT:    v_writelane_b32 v5, s56, 60
-; CHECK-NEXT:    v_writelane_b32 v4, s64, 4
-; CHECK-NEXT:    v_writelane_b32 v5, s57, 61
-; CHECK-NEXT:    v_writelane_b32 v4, s65, 5
-; CHECK-NEXT:    v_writelane_b32 v5, s58, 62
-; CHECK-NEXT:    v_writelane_b32 v4, s66, 6
-; CHECK-NEXT:    v_writelane_b32 v5, s59, 63
-; CHECK-NEXT:    v_writelane_b32 v4, s67, 7
+; CHECK-NEXT:    v_readlane_b32 s12, v4, 32
+; CHECK-NEXT:    v_readlane_b32 s13, v4, 33
+; CHECK-NEXT:    v_readlane_b32 s14, v4, 34
+; CHECK-NEXT:    v_readlane_b32 s15, v4, 35
+; CHECK-NEXT:    v_readlane_b32 s16, v4, 36
+; CHECK-NEXT:    v_readlane_b32 s17, v4, 37
+; CHECK-NEXT:    v_readlane_b32 s18, v4, 38
+; CHECK-NEXT:    v_readlane_b32 s19, v4, 39
+; CHECK-NEXT:    v_writelane_b32 v4, s4, 40
+; CHECK-NEXT:    v_writelane_b32 v4, s5, 41
+; CHECK-NEXT:    v_writelane_b32 v4, s6, 42
+; CHECK-NEXT:    v_writelane_b32 v4, s7, 43
+; CHECK-NEXT:    v_writelane_b32 v4, s8, 44
+; CHECK-NEXT:    v_writelane_b32 v4, s9, 45
+; CHECK-NEXT:    v_writelane_b32 v4, s10, 46
+; CHECK-NEXT:    v_writelane_b32 v4, s11, 47
+; CHECK-NEXT:    v_writelane_b32 v4, s12, 48
+; CHECK-NEXT:    v_writelane_b32 v4, s13, 49
+; CHECK-NEXT:    v_writelane_b32 v4, s14, 50
+; CHECK-NEXT:    v_writelane_b32 v4, s15, 51
+; CHECK-NEXT:    v_writelane_b32 v4, s16, 52
+; CHECK-NEXT:    v_writelane_b32 v4, s17, 53
+; CHECK-NEXT:    v_writelane_b32 v4, s18, 54
+; CHECK-NEXT:    v_writelane_b32 v4, s19, 55
+; CHECK-NEXT:    v_writelane_b32 v4, s52, 56
+; CHECK-NEXT:    v_writelane_b32 v3, s60, 0
+; CHECK-NEXT:    v_writelane_b32 v4, s53, 57
+; CHECK-NEXT:    v_writelane_b32 v3, s61, 1
+; CHECK-NEXT:    v_writelane_b32 v4, s54, 58
+; CHECK-NEXT:    v_writelane_b32 v3, s62, 2
+; CHECK-NEXT:    v_writelane_b32 v4, s55, 59
+; CHECK-NEXT:    v_writelane_b32 v3, s63, 3
+; CHECK-NEXT:    v_writelane_b32 v4, s56, 60
+; CHECK-NEXT:    v_writelane_b32 v3, s64, 4
+; CHECK-NEXT:    v_writelane_b32 v4, s57, 61
+; CHECK-NEXT:    v_writelane_b32 v3, s65, 5
+; CHECK-NEXT:    v_writelane_b32 v4, s58, 62
+; CHECK-NEXT:    v_writelane_b32 v3, s66, 6
+; CHECK-NEXT:    v_writelane_b32 v4, s59, 63
+; CHECK-NEXT:    v_writelane_b32 v3, s67, 7
 ; CHECK-NEXT:    s_andn2_saveexec_b64 s[20:21], s[26:27]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %bb32
@@ -218,102 +218,102 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:  ; %bb.5: ; %bb43
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s9, s8
-; CHECK-NEXT:    v_mov_b32_e32 v2, s8
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 0
-; CHECK-NEXT:    v_mov_b32_e32 v3, s9
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_readlane_b32 s36, v4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
-; CHECK-NEXT:    v_readlane_b32 s37, v5, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v5, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v5, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v5, 4
-; CHECK-NEXT:    v_readlane_b32 s41, v5, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v5, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v5, 7
-; CHECK-NEXT:    v_readlane_b32 s44, v5, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v5, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v5, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v5, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v5, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v5, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v5, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v5, 15
-; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 16
-; CHECK-NEXT:    v_readlane_b32 s44, v5, 24
-; CHECK-NEXT:    v_readlane_b32 s45, v5, 25
-; CHECK-NEXT:    v_readlane_b32 s46, v5, 26
-; CHECK-NEXT:    v_readlane_b32 s47, v5, 27
-; CHECK-NEXT:    v_readlane_b32 s48, v5, 28
-; CHECK-NEXT:    v_readlane_b32 s49, v5, 29
-; CHECK-NEXT:    v_readlane_b32 s50, v5, 30
-; CHECK-NEXT:    v_readlane_b32 s51, v5, 31
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
-; CHECK-NEXT:    v_mov_b32_e32 v8, v7
-; CHECK-NEXT:    v_readlane_b32 s37, v5, 17
-; CHECK-NEXT:    v_readlane_b32 s38, v5, 18
-; CHECK-NEXT:    v_readlane_b32 s39, v5, 19
-; CHECK-NEXT:    image_sample_lz v2, v[2:3], s[44:51], s[12:15] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s40, v5, 20
-; CHECK-NEXT:    v_readlane_b32 s41, v5, 21
-; CHECK-NEXT:    v_readlane_b32 s42, v5, 22
-; CHECK-NEXT:    v_readlane_b32 s43, v5, 23
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    v_readlane_b32 s37, v4, 1
+; CHECK-NEXT:    v_readlane_b32 s38, v4, 2
+; CHECK-NEXT:    v_readlane_b32 s39, v4, 3
+; CHECK-NEXT:    v_readlane_b32 s40, v4, 4
+; CHECK-NEXT:    v_readlane_b32 s41, v4, 5
+; CHECK-NEXT:    v_readlane_b32 s42, v4, 6
+; CHECK-NEXT:    v_readlane_b32 s43, v4, 7
+; CHECK-NEXT:    v_readlane_b32 s44, v4, 8
+; CHECK-NEXT:    v_readlane_b32 s45, v4, 9
+; CHECK-NEXT:    v_readlane_b32 s46, v4, 10
+; CHECK-NEXT:    v_readlane_b32 s47, v4, 11
+; CHECK-NEXT:    v_readlane_b32 s48, v4, 12
+; CHECK-NEXT:    v_readlane_b32 s49, v4, 13
+; CHECK-NEXT:    v_readlane_b32 s50, v4, 14
+; CHECK-NEXT:    v_readlane_b32 s51, v4, 15
+; CHECK-NEXT:    image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s36, v4, 16
+; CHECK-NEXT:    v_readlane_b32 s44, v4, 24
+; CHECK-NEXT:    v_readlane_b32 s45, v4, 25
+; CHECK-NEXT:    v_readlane_b32 s46, v4, 26
+; CHECK-NEXT:    v_readlane_b32 s47, v4, 27
+; CHECK-NEXT:    v_readlane_b32 s48, v4, 28
+; CHECK-NEXT:    v_readlane_b32 s49, v4, 29
+; CHECK-NEXT:    v_readlane_b32 s50, v4, 30
+; CHECK-NEXT:    v_readlane_b32 s51, v4, 31
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, v6
+; CHECK-NEXT:    v_readlane_b32 s37, v4, 17
+; CHECK-NEXT:    v_readlane_b32 s38, v4, 18
+; CHECK-NEXT:    v_readlane_b32 s39, v4, 19
+; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s40, v4, 20
+; CHECK-NEXT:    v_readlane_b32 s41, v4, 21
+; CHECK-NEXT:    v_readlane_b32 s42, v4, 22
+; CHECK-NEXT:    v_readlane_b32 s43, v4, 23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dwordx3 v[6:8], off, s[8:11], 0
+; CHECK-NEXT:    buffer_store_dwordx3 v[5:7], off, s[8:11], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:  .LBB0_6: ; %Flow12
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[22:23]
-; CHECK-NEXT:    v_readlane_b32 s52, v5, 40
-; CHECK-NEXT:    v_readlane_b32 s53, v5, 41
-; CHECK-NEXT:    v_readlane_b32 s54, v5, 42
-; CHECK-NEXT:    v_readlane_b32 s55, v5, 43
-; CHECK-NEXT:    v_readlane_b32 s56, v5, 44
-; CHECK-NEXT:    v_readlane_b32 s57, v5, 45
-; CHECK-NEXT:    v_readlane_b32 s58, v5, 46
-; CHECK-NEXT:    v_readlane_b32 s59, v5, 47
-; CHECK-NEXT:    v_readlane_b32 s60, v5, 48
-; CHECK-NEXT:    v_readlane_b32 s61, v5, 49
-; CHECK-NEXT:    v_readlane_b32 s62, v5, 50
-; CHECK-NEXT:    v_readlane_b32 s63, v5, 51
-; CHECK-NEXT:    v_readlane_b32 s64, v5, 52
-; CHECK-NEXT:    v_readlane_b32 s65, v5, 53
-; CHECK-NEXT:    v_readlane_b32 s66, v5, 54
-; CHECK-NEXT:    v_readlane_b32 s67, v5, 55
+; CHECK-NEXT:    v_readlane_b32 s52, v4, 40
+; CHECK-NEXT:    v_readlane_b32 s53, v4, 41
+; CHECK-NEXT:    v_readlane_b32 s54, v4, 42
+; CHECK-NEXT:    v_readlane_b32 s55, v4, 43
+; CHECK-NEXT:    v_readlane_b32 s56, v4, 44
+; CHECK-NEXT:    v_readlane_b32 s57, v4, 45
+; CHECK-NEXT:    v_readlane_b32 s58, v4, 46
+; CHECK-NEXT:    v_readlane_b32 s59, v4, 47
+; CHECK-NEXT:    v_readlane_b32 s60, v4, 48
+; CHECK-NEXT:    v_readlane_b32 s61, v4, 49
+; CHECK-NEXT:    v_readlane_b32 s62, v4, 50
+; CHECK-NEXT:    v_readlane_b32 s63, v4, 51
+; CHECK-NEXT:    v_readlane_b32 s64, v4, 52
+; CHECK-NEXT:    v_readlane_b32 s65, v4, 53
+; CHECK-NEXT:    v_readlane_b32 s66, v4, 54
+; CHECK-NEXT:    v_readlane_b32 s67, v4, 55
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_9
 ; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s6, s8
 ; CHECK-NEXT:    s_mov_b32 s7, s8
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 56
+; CHECK-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-NEXT:    v_readlane_b32 s36, v4, 56
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
-; CHECK-NEXT:    v_readlane_b32 s37, v5, 57
-; CHECK-NEXT:    v_readlane_b32 s38, v5, 58
-; CHECK-NEXT:    v_readlane_b32 s39, v5, 59
-; CHECK-NEXT:    v_readlane_b32 s40, v5, 60
-; CHECK-NEXT:    v_readlane_b32 s41, v5, 61
-; CHECK-NEXT:    v_readlane_b32 s42, v5, 62
-; CHECK-NEXT:    v_readlane_b32 s43, v5, 63
+; CHECK-NEXT:    v_mov_b32_e32 v2, s7
+; CHECK-NEXT:    v_readlane_b32 s37, v4, 57
+; CHECK-NEXT:    v_readlane_b32 s38, v4, 58
+; CHECK-NEXT:    v_readlane_b32 s39, v4, 59
+; CHECK-NEXT:    v_readlane_b32 s40, v4, 60
+; CHECK-NEXT:    v_readlane_b32 s41, v4, 61
+; CHECK-NEXT:    v_readlane_b32 s42, v4, 62
+; CHECK-NEXT:    v_readlane_b32 s43, v4, 63
 ; CHECK-NEXT:    s_nop 4
-; CHECK-NEXT:    image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    image_sample_lz v7, v[2:3], s[52:59], s[8:11] dmask:0x1
-; CHECK-NEXT:    ; kill: killed $vgpr2_vgpr3
+; CHECK-NEXT:    image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1
+; CHECK-NEXT:    ; kill: killed $vgpr1_vgpr2
 ; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v4, 0
-; CHECK-NEXT:    v_readlane_b32 s45, v4, 1
-; CHECK-NEXT:    v_readlane_b32 s46, v4, 2
-; CHECK-NEXT:    v_readlane_b32 s47, v4, 3
-; CHECK-NEXT:    v_readlane_b32 s48, v4, 4
-; CHECK-NEXT:    v_readlane_b32 s49, v4, 5
-; CHECK-NEXT:    v_readlane_b32 s50, v4, 6
-; CHECK-NEXT:    v_readlane_b32 s51, v4, 7
+; CHECK-NEXT:    v_readlane_b32 s44, v3, 0
+; CHECK-NEXT:    v_readlane_b32 s45, v3, 1
+; CHECK-NEXT:    v_readlane_b32 s46, v3, 2
+; CHECK-NEXT:    v_readlane_b32 s47, v3, 3
+; CHECK-NEXT:    v_readlane_b32 s48, v3, 4
+; CHECK-NEXT:    v_readlane_b32 s49, v3, 5
+; CHECK-NEXT:    v_readlane_b32 s50, v3, 6
+; CHECK-NEXT:    v_readlane_b32 s51, v3, 7
 ; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
@@ -321,59 +321,59 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
 ; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_sub_f32_e32 v2, v7, v6
-; CHECK-NEXT:    v_mul_f32_e32 v0, v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_sub_f32_e32 v1, v6, v5
+; CHECK-NEXT:    v_mul_f32_e32 v0, v1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  .LBB0_8: ; %bb33
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_add_f32_e32 v3, v2, v0
-; CHECK-NEXT:    v_sub_f32_e32 v2, v2, v3
+; CHECK-NEXT:    v_add_f32_e32 v2, v1, v0
+; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_8
 ; CHECK-NEXT:  .LBB0_9: ; %Flow13
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[20:21]
-; CHECK-NEXT:    v_readlane_b32 s67, v1, 33
-; CHECK-NEXT:    v_readlane_b32 s66, v1, 32
-; CHECK-NEXT:    v_readlane_b32 s65, v1, 31
-; CHECK-NEXT:    v_readlane_b32 s64, v1, 30
-; CHECK-NEXT:    v_readlane_b32 s63, v1, 29
-; CHECK-NEXT:    v_readlane_b32 s62, v1, 28
-; CHECK-NEXT:    v_readlane_b32 s61, v1, 27
-; CHECK-NEXT:    v_readlane_b32 s60, v1, 26
-; CHECK-NEXT:    v_readlane_b32 s59, v1, 25
-; CHECK-NEXT:    v_readlane_b32 s58, v1, 24
-; CHECK-NEXT:    v_readlane_b32 s57, v1, 23
-; CHECK-NEXT:    v_readlane_b32 s56, v1, 22
-; CHECK-NEXT:    v_readlane_b32 s55, v1, 21
-; CHECK-NEXT:    v_readlane_b32 s54, v1, 20
-; CHECK-NEXT:    v_readlane_b32 s53, v1, 19
-; CHECK-NEXT:    v_readlane_b32 s52, v1, 18
-; CHECK-NEXT:    v_readlane_b32 s51, v1, 17
-; CHECK-NEXT:    v_readlane_b32 s50, v1, 16
-; CHECK-NEXT:    v_readlane_b32 s49, v1, 15
-; CHECK-NEXT:    v_readlane_b32 s48, v1, 14
-; CHECK-NEXT:    v_readlane_b32 s47, v1, 13
-; CHECK-NEXT:    v_readlane_b32 s46, v1, 12
-; CHECK-NEXT:    v_readlane_b32 s45, v1, 11
-; CHECK-NEXT:    v_readlane_b32 s44, v1, 10
-; CHECK-NEXT:    v_readlane_b32 s43, v1, 9
-; CHECK-NEXT:    v_readlane_b32 s42, v1, 8
-; CHECK-NEXT:    v_readlane_b32 s41, v1, 7
-; CHECK-NEXT:    v_readlane_b32 s40, v1, 6
-; CHECK-NEXT:    v_readlane_b32 s39, v1, 5
-; CHECK-NEXT:    v_readlane_b32 s38, v1, 4
-; CHECK-NEXT:    v_readlane_b32 s37, v1, 3
-; CHECK-NEXT:    v_readlane_b32 s36, v1, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v1, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v1, 0
-; CHECK-NEXT:    ; kill: killed $vgpr5
+; CHECK-NEXT:    v_readlane_b32 s67, v8, 33
+; CHECK-NEXT:    v_readlane_b32 s66, v8, 32
+; CHECK-NEXT:    v_readlane_b32 s65, v8, 31
+; CHECK-NEXT:    v_readlane_b32 s64, v8, 30
+; CHECK-NEXT:    v_readlane_b32 s63, v8, 29
+; CHECK-NEXT:    v_readlane_b32 s62, v8, 28
+; CHECK-NEXT:    v_readlane_b32 s61, v8, 27
+; CHECK-NEXT:    v_readlane_b32 s60, v8, 26
+; CHECK-NEXT:    v_readlane_b32 s59, v8, 25
+; CHECK-NEXT:    v_readlane_b32 s58, v8, 24
+; CHECK-NEXT:    v_readlane_b32 s57, v8, 23
+; CHECK-NEXT:    v_readlane_b32 s56, v8, 22
+; CHECK-NEXT:    v_readlane_b32 s55, v8, 21
+; CHECK-NEXT:    v_readlane_b32 s54, v8, 20
+; CHECK-NEXT:    v_readlane_b32 s53, v8, 19
+; CHECK-NEXT:    v_readlane_b32 s52, v8, 18
+; CHECK-NEXT:    v_readlane_b32 s51, v8, 17
+; CHECK-NEXT:    v_readlane_b32 s50, v8, 16
+; CHECK-NEXT:    v_readlane_b32 s49, v8, 15
+; CHECK-NEXT:    v_readlane_b32 s48, v8, 14
+; CHECK-NEXT:    v_readlane_b32 s47, v8, 13
+; CHECK-NEXT:    v_readlane_b32 s46, v8, 12
+; CHECK-NEXT:    v_readlane_b32 s45, v8, 11
+; CHECK-NEXT:    v_readlane_b32 s44, v8, 10
+; CHECK-NEXT:    v_readlane_b32 s43, v8, 9
+; CHECK-NEXT:    v_readlane_b32 s42, v8, 8
+; CHECK-NEXT:    v_readlane_b32 s41, v8, 7
+; CHECK-NEXT:    v_readlane_b32 s40, v8, 6
+; CHECK-NEXT:    v_readlane_b32 s39, v8, 5
+; CHECK-NEXT:    v_readlane_b32 s38, v8, 4
+; CHECK-NEXT:    v_readlane_b32 s37, v8, 3
+; CHECK-NEXT:    v_readlane_b32 s36, v8, 2
+; CHECK-NEXT:    v_readlane_b32 s31, v8, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v8, 0
 ; CHECK-NEXT:    ; kill: killed $vgpr4
+; CHECK-NEXT:    ; kill: killed $vgpr3
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 3aaf04c94cda58..408199bbc92237 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -1042,92 +1042,92 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
-; GCN-NEXT:    v_writelane_b32 v40, s60, 28
-; GCN-NEXT:    v_writelane_b32 v40, s61, 29
-; GCN-NEXT:    v_writelane_b32 v40, s62, 30
-; GCN-NEXT:    v_writelane_b32 v40, s63, 31
-; GCN-NEXT:    v_mov_b32_e32 v41, v0
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v41, s30, 0
+; GCN-NEXT:    v_writelane_b32 v41, s31, 1
+; GCN-NEXT:    v_writelane_b32 v41, s34, 2
+; GCN-NEXT:    v_writelane_b32 v41, s35, 3
+; GCN-NEXT:    v_writelane_b32 v41, s36, 4
+; GCN-NEXT:    v_writelane_b32 v41, s37, 5
+; GCN-NEXT:    v_writelane_b32 v41, s38, 6
+; GCN-NEXT:    v_writelane_b32 v41, s39, 7
+; GCN-NEXT:    v_writelane_b32 v41, s40, 8
+; GCN-NEXT:    v_writelane_b32 v41, s41, 9
+; GCN-NEXT:    v_writelane_b32 v41, s42, 10
+; GCN-NEXT:    v_writelane_b32 v41, s43, 11
+; GCN-NEXT:    v_writelane_b32 v41, s44, 12
+; GCN-NEXT:    v_writelane_b32 v41, s45, 13
+; GCN-NEXT:    v_writelane_b32 v41, s46, 14
+; GCN-NEXT:    v_writelane_b32 v41, s47, 15
+; GCN-NEXT:    v_writelane_b32 v41, s48, 16
+; GCN-NEXT:    v_writelane_b32 v41, s49, 17
+; GCN-NEXT:    v_writelane_b32 v41, s50, 18
+; GCN-NEXT:    v_writelane_b32 v41, s51, 19
+; GCN-NEXT:    v_writelane_b32 v41, s52, 20
+; GCN-NEXT:    v_writelane_b32 v41, s53, 21
+; GCN-NEXT:    v_writelane_b32 v41, s54, 22
+; GCN-NEXT:    v_writelane_b32 v41, s55, 23
+; GCN-NEXT:    v_writelane_b32 v41, s56, 24
+; GCN-NEXT:    v_writelane_b32 v41, s57, 25
+; GCN-NEXT:    v_writelane_b32 v41, s58, 26
+; GCN-NEXT:    v_writelane_b32 v41, s59, 27
+; GCN-NEXT:    v_writelane_b32 v41, s60, 28
+; GCN-NEXT:    v_writelane_b32 v41, s61, 29
+; GCN-NEXT:    v_writelane_b32 v41, s62, 30
+; GCN-NEXT:    v_writelane_b32 v41, s63, 31
+; GCN-NEXT:    v_mov_b32_e32 v40, v0
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v2
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2]
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, v41
+; GCN-NEXT:    v_mov_b32_e32 v0, v40
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz .LBB7_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, v41
-; GCN-NEXT:    v_readlane_b32 s63, v40, 31
-; GCN-NEXT:    v_readlane_b32 s62, v40, 30
-; GCN-NEXT:    v_readlane_b32 s61, v40, 29
-; GCN-NEXT:    v_readlane_b32 s60, v40, 28
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    v_mov_b32_e32 v0, v40
+; GCN-NEXT:    v_readlane_b32 s63, v41, 31
+; GCN-NEXT:    v_readlane_b32 s62, v41, 30
+; GCN-NEXT:    v_readlane_b32 s61, v41, 29
+; GCN-NEXT:    v_readlane_b32 s60, v41, 28
+; GCN-NEXT:    v_readlane_b32 s59, v41, 27
+; GCN-NEXT:    v_readlane_b32 s58, v41, 26
+; GCN-NEXT:    v_readlane_b32 s57, v41, 25
+; GCN-NEXT:    v_readlane_b32 s56, v41, 24
+; GCN-NEXT:    v_readlane_b32 s55, v41, 23
+; GCN-NEXT:    v_readlane_b32 s54, v41, 22
+; GCN-NEXT:    v_readlane_b32 s53, v41, 21
+; GCN-NEXT:    v_readlane_b32 s52, v41, 20
+; GCN-NEXT:    v_readlane_b32 s51, v41, 19
+; GCN-NEXT:    v_readlane_b32 s50, v41, 18
+; GCN-NEXT:    v_readlane_b32 s49, v41, 17
+; GCN-NEXT:    v_readlane_b32 s48, v41, 16
+; GCN-NEXT:    v_readlane_b32 s47, v41, 15
+; GCN-NEXT:    v_readlane_b32 s46, v41, 14
+; GCN-NEXT:    v_readlane_b32 s45, v41, 13
+; GCN-NEXT:    v_readlane_b32 s44, v41, 12
+; GCN-NEXT:    v_readlane_b32 s43, v41, 11
+; GCN-NEXT:    v_readlane_b32 s42, v41, 10
+; GCN-NEXT:    v_readlane_b32 s41, v41, 9
+; GCN-NEXT:    v_readlane_b32 s40, v41, 8
+; GCN-NEXT:    v_readlane_b32 s39, v41, 7
+; GCN-NEXT:    v_readlane_b32 s38, v41, 6
+; GCN-NEXT:    v_readlane_b32 s37, v41, 5
+; GCN-NEXT:    v_readlane_b32 s36, v41, 4
+; GCN-NEXT:    v_readlane_b32 s35, v41, 3
+; GCN-NEXT:    v_readlane_b32 s34, v41, 2
+; GCN-NEXT:    v_readlane_b32 s31, v41, 1
+; GCN-NEXT:    v_readlane_b32 s30, v41, 0
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s10
@@ -1140,92 +1140,92 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GISEL-NEXT:    s_mov_b32 s10, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
-; GISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
-; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
-; GISEL-NEXT:    v_writelane_b32 v40, s34, 2
-; GISEL-NEXT:    v_writelane_b32 v40, s35, 3
-; GISEL-NEXT:    v_writelane_b32 v40, s36, 4
-; GISEL-NEXT:    v_writelane_b32 v40, s37, 5
-; GISEL-NEXT:    v_writelane_b32 v40, s38, 6
-; GISEL-NEXT:    v_writelane_b32 v40, s39, 7
-; GISEL-NEXT:    v_writelane_b32 v40, s40, 8
-; GISEL-NEXT:    v_writelane_b32 v40, s41, 9
-; GISEL-NEXT:    v_writelane_b32 v40, s42, 10
-; GISEL-NEXT:    v_writelane_b32 v40, s43, 11
-; GISEL-NEXT:    v_writelane_b32 v40, s44, 12
-; GISEL-NEXT:    v_writelane_b32 v40, s45, 13
-; GISEL-NEXT:    v_writelane_b32 v40, s46, 14
-; GISEL-NEXT:    v_writelane_b32 v40, s47, 15
-; GISEL-NEXT:    v_writelane_b32 v40, s48, 16
-; GISEL-NEXT:    v_writelane_b32 v40, s49, 17
-; GISEL-NEXT:    v_writelane_b32 v40, s50, 18
-; GISEL-NEXT:    v_writelane_b32 v40, s51, 19
-; GISEL-NEXT:    v_writelane_b32 v40, s52, 20
-; GISEL-NEXT:    v_writelane_b32 v40, s53, 21
-; GISEL-NEXT:    v_writelane_b32 v40, s54, 22
-; GISEL-NEXT:    v_writelane_b32 v40, s55, 23
-; GISEL-NEXT:    v_writelane_b32 v40, s56, 24
-; GISEL-NEXT:    v_writelane_b32 v40, s57, 25
-; GISEL-NEXT:    v_writelane_b32 v40, s58, 26
-; GISEL-NEXT:    v_writelane_b32 v40, s59, 27
-; GISEL-NEXT:    v_writelane_b32 v40, s60, 28
-; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
-; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
-; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
-; GISEL-NEXT:    v_mov_b32_e32 v41, v0
+; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GISEL-NEXT:    v_writelane_b32 v41, s30, 0
+; GISEL-NEXT:    v_writelane_b32 v41, s31, 1
+; GISEL-NEXT:    v_writelane_b32 v41, s34, 2
+; GISEL-NEXT:    v_writelane_b32 v41, s35, 3
+; GISEL-NEXT:    v_writelane_b32 v41, s36, 4
+; GISEL-NEXT:    v_writelane_b32 v41, s37, 5
+; GISEL-NEXT:    v_writelane_b32 v41, s38, 6
+; GISEL-NEXT:    v_writelane_b32 v41, s39, 7
+; GISEL-NEXT:    v_writelane_b32 v41, s40, 8
+; GISEL-NEXT:    v_writelane_b32 v41, s41, 9
+; GISEL-NEXT:    v_writelane_b32 v41, s42, 10
+; GISEL-NEXT:    v_writelane_b32 v41, s43, 11
+; GISEL-NEXT:    v_writelane_b32 v41, s44, 12
+; GISEL-NEXT:    v_writelane_b32 v41, s45, 13
+; GISEL-NEXT:    v_writelane_b32 v41, s46, 14
+; GISEL-NEXT:    v_writelane_b32 v41, s47, 15
+; GISEL-NEXT:    v_writelane_b32 v41, s48, 16
+; GISEL-NEXT:    v_writelane_b32 v41, s49, 17
+; GISEL-NEXT:    v_writelane_b32 v41, s50, 18
+; GISEL-NEXT:    v_writelane_b32 v41, s51, 19
+; GISEL-NEXT:    v_writelane_b32 v41, s52, 20
+; GISEL-NEXT:    v_writelane_b32 v41, s53, 21
+; GISEL-NEXT:    v_writelane_b32 v41, s54, 22
+; GISEL-NEXT:    v_writelane_b32 v41, s55, 23
+; GISEL-NEXT:    v_writelane_b32 v41, s56, 24
+; GISEL-NEXT:    v_writelane_b32 v41, s57, 25
+; GISEL-NEXT:    v_writelane_b32 v41, s58, 26
+; GISEL-NEXT:    v_writelane_b32 v41, s59, 27
+; GISEL-NEXT:    v_writelane_b32 v41, s60, 28
+; GISEL-NEXT:    v_writelane_b32 v41, s61, 29
+; GISEL-NEXT:    v_writelane_b32 v41, s62, 30
+; GISEL-NEXT:    v_writelane_b32 v41, s63, 31
+; GISEL-NEXT:    v_mov_b32_e32 v40, v0
 ; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s6, v1
 ; GISEL-NEXT:    v_readfirstlane_b32 s7, v2
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GISEL-NEXT:    v_mov_b32_e32 v0, v41
+; GISEL-NEXT:    v_mov_b32_e32 v0, v40
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GISEL-NEXT:    ; implicit-def: $vgpr1
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB7_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v0, v41
-; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
-; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
-; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
-; GISEL-NEXT:    v_readlane_b32 s60, v40, 28
-; GISEL-NEXT:    v_readlane_b32 s59, v40, 27
-; GISEL-NEXT:    v_readlane_b32 s58, v40, 26
-; GISEL-NEXT:    v_readlane_b32 s57, v40, 25
-; GISEL-NEXT:    v_readlane_b32 s56, v40, 24
-; GISEL-NEXT:    v_readlane_b32 s55, v40, 23
-; GISEL-NEXT:    v_readlane_b32 s54, v40, 22
-; GISEL-NEXT:    v_readlane_b32 s53, v40, 21
-; GISEL-NEXT:    v_readlane_b32 s52, v40, 20
-; GISEL-NEXT:    v_readlane_b32 s51, v40, 19
-; GISEL-NEXT:    v_readlane_b32 s50, v40, 18
-; GISEL-NEXT:    v_readlane_b32 s49, v40, 17
-; GISEL-NEXT:    v_readlane_b32 s48, v40, 16
-; GISEL-NEXT:    v_readlane_b32 s47, v40, 15
-; GISEL-NEXT:    v_readlane_b32 s46, v40, 14
-; GISEL-NEXT:    v_readlane_b32 s45, v40, 13
-; GISEL-NEXT:    v_readlane_b32 s44, v40, 12
-; GISEL-NEXT:    v_readlane_b32 s43, v40, 11
-; GISEL-NEXT:    v_readlane_b32 s42, v40, 10
-; GISEL-NEXT:    v_readlane_b32 s41, v40, 9
-; GISEL-NEXT:    v_readlane_b32 s40, v40, 8
-; GISEL-NEXT:    v_readlane_b32 s39, v40, 7
-; GISEL-NEXT:    v_readlane_b32 s38, v40, 6
-; GISEL-NEXT:    v_readlane_b32 s37, v40, 5
-; GISEL-NEXT:    v_readlane_b32 s36, v40, 4
-; GISEL-NEXT:    v_readlane_b32 s35, v40, 3
-; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
-; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
-; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
-; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GISEL-NEXT:    v_mov_b32_e32 v0, v40
+; GISEL-NEXT:    v_readlane_b32 s63, v41, 31
+; GISEL-NEXT:    v_readlane_b32 s62, v41, 30
+; GISEL-NEXT:    v_readlane_b32 s61, v41, 29
+; GISEL-NEXT:    v_readlane_b32 s60, v41, 28
+; GISEL-NEXT:    v_readlane_b32 s59, v41, 27
+; GISEL-NEXT:    v_readlane_b32 s58, v41, 26
+; GISEL-NEXT:    v_readlane_b32 s57, v41, 25
+; GISEL-NEXT:    v_readlane_b32 s56, v41, 24
+; GISEL-NEXT:    v_readlane_b32 s55, v41, 23
+; GISEL-NEXT:    v_readlane_b32 s54, v41, 22
+; GISEL-NEXT:    v_readlane_b32 s53, v41, 21
+; GISEL-NEXT:    v_readlane_b32 s52, v41, 20
+; GISEL-NEXT:    v_readlane_b32 s51, v41, 19
+; GISEL-NEXT:    v_readlane_b32 s50, v41, 18
+; GISEL-NEXT:    v_readlane_b32 s49, v41, 17
+; GISEL-NEXT:    v_readlane_b32 s48, v41, 16
+; GISEL-NEXT:    v_readlane_b32 s47, v41, 15
+; GISEL-NEXT:    v_readlane_b32 s46, v41, 14
+; GISEL-NEXT:    v_readlane_b32 s45, v41, 13
+; GISEL-NEXT:    v_readlane_b32 s44, v41, 12
+; GISEL-NEXT:    v_readlane_b32 s43, v41, 11
+; GISEL-NEXT:    v_readlane_b32 s42, v41, 10
+; GISEL-NEXT:    v_readlane_b32 s41, v41, 9
+; GISEL-NEXT:    v_readlane_b32 s40, v41, 8
+; GISEL-NEXT:    v_readlane_b32 s39, v41, 7
+; GISEL-NEXT:    v_readlane_b32 s38, v41, 6
+; GISEL-NEXT:    v_readlane_b32 s37, v41, 5
+; GISEL-NEXT:    v_readlane_b32 s36, v41, 4
+; GISEL-NEXT:    v_readlane_b32 s35, v41, 3
+; GISEL-NEXT:    v_readlane_b32 s34, v41, 2
+; GISEL-NEXT:    v_readlane_b32 s31, v41, 1
+; GISEL-NEXT:    v_readlane_b32 s30, v41, 0
+; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    s_mov_b32 s33, s10

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index cfec77e68eae96..833aba9b26afd6 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -13,7 +13,7 @@ define fastcc i32 @foo() {
   ; CHECK-NEXT:   $sgpr16 = S_MOV_B32 $sgpr33
   ; CHECK-NEXT:   $sgpr33 = S_MOV_B32 $sgpr32
   ; CHECK-NEXT:   $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
   ; CHECK-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr17
   ; CHECK-NEXT:   $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
   ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
@@ -26,8 +26,8 @@ define fastcc i32 @foo() {
   ; CHECK-NEXT:   BUFFER_GL0_INV implicit $exec
   ; CHECK-NEXT:   BUFFER_GL1_INV implicit $exec
   ; CHECK-NEXT:   renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40
-  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40
+  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, killed $vgpr40
+  ; CHECK-NEXT:   $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr40
   ; CHECK-NEXT:   S_WAITCNT 49279
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $vcc_lo = S_MOV_B32 $exec_lo
@@ -43,7 +43,7 @@ define fastcc i32 @foo() {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $sgpr31 = V_READLANE_B32 $vgpr40, 1
   ; CHECK-NEXT:   $sgpr30 = V_READLANE_B32 $vgpr40, 0
-  ; CHECK-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr40, 2
+  ; CHECK-NEXT:   $sgpr4 = V_READLANE_B32 killed $vgpr40, 2
   ; CHECK-NEXT:   $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
   ; CHECK-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr5

diff  --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 07693371278701..6c8646968b6762 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -42,11 +42,11 @@ define amdgpu_kernel void @kernel_call() #0 {
 ; GCN-LABEL: {{^}}func_regular_call:
 ; GCN-NOT: buffer_load
 ; GCN-NOT: readlane
-; GCN: flat_load_dword v9
+; GCN: flat_load_dword v8
 ; GCN: s_swappc_b64
 ; GCN-NOT: buffer_load
 ; GCN-NOT: readlane
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v9
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
 ; GCN: ; NumSgprs: 34
 ; GCN: ; NumVgprs: 10
@@ -72,9 +72,9 @@ define void @func_tail_call() #1 {
 }
 
 ; GCN-LABEL: {{^}}func_call_tail_call:
-; GCN: flat_load_dword v9
+; GCN: flat_load_dword v8
 ; GCN: s_swappc_b64
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v9
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 ; GCN: s_setpc_b64
 
 ; GCN: ; NumSgprs: 34

diff  --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 797b13044e7226..1e9994dd8e6efd 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -191,47 +191,47 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v43, s4, 5
+; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 3
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v43, s36, 3
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, foo@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 4
+; GFX9-NEXT:    v_writelane_b32 v43, s37, 4
 ; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_mov_b32_e32 v41, v1
-; GFX9-NEXT:    v_mov_b32_e32 v42, v0
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v42, v41
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v40, v1
+; GFX9-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v41, v40
 ; GFX9-NEXT:    s_mov_b32 s34, s15
-; GFX9-NEXT:    v_and_b32_e32 v43, 0xffffff, v41
+; GFX9-NEXT:    v_and_b32_e32 v42, 0xffffff, v40
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-NEXT:    v_mad_u32_u24 v41, v42, v41, v43
+; GFX9-NEXT:    v_mad_u32_u24 v40, v41, v40, v42
 ; GFX9-NEXT:    s_mov_b32 s15, s34
-; GFX9-NEXT:    v_mov_b32_e32 v0, v41
+; GFX9-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-NEXT:    v_add_u32_e32 v0, v41, v43
+; GFX9-NEXT:    v_add_u32_e32 v0, v40, v42
 ; GFX9-NEXT:    s_mov_b32 s15, s34
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 5
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s37, v43, 4
+; GFX9-NEXT:    v_readlane_b32 s36, v43, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v43, 5
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index e0efd09f1f14be..4ea77d1d1ac159 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -8,23 +8,23 @@ define void @test_remat_s_getpc_b64() {
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    global_store_dwordx2 v[1:2], v[1:2], off
-; GFX9-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -33,23 +33,23 @@ define void @test_remat_s_getpc_b64() {
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v0, 0
-; GFX11-NEXT:    global_store_b64 v[1:2], v[1:2], off
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -62,26 +62,26 @@ define void @test_remat_s_getpc_b64() {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX12-NEXT:    scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
-; GFX12-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX12-NEXT:    v_writelane_b32 v2, s30, 0
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX12-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
 ; GFX12-NEXT:    s_sext_i32_i16 s1, s1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT:    v_readlane_b32 s30, v0, 0
-; GFX12-NEXT:    global_store_b64 v[1:2], v[1:2], off
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX12-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GFX12-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 4544b177351eeb..6a2532147f886c 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -28,183 +28,183 @@ body:             |
   ; GCN-LABEL: name: test_main
   ; GCN: bb.0:
   ; GCN-NEXT:   successors: %bb.1(0x80000000)
-  ; GCN-NEXT:   liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5
+  ; GCN-NEXT:   liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   $sgpr0 = COPY $sgpr33
   ; GCN-NEXT:   $sgpr33 = frame-setup COPY $sgpr32
   ; GCN-NEXT:   $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5)
-  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5)
+  ; GCN-NEXT:   SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5)
   ; GCN-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr1
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr3
+  ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5
   ; GCN-NEXT:   $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
-  ; GCN-NEXT:   renamable $vgpr5 = IMPLICIT_DEF
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr1
-  ; GCN-NEXT:   $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr1
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr2
-  ; GCN-NEXT:   $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr2
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr3
-  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr3
+  ; GCN-NEXT:   renamable $vgpr2 = IMPLICIT_DEF
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr3
+  ; GCN-NEXT:   $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr3
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr4
+  ; GCN-NEXT:   $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr4
+  ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr5
+  ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr5
+  ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr5
+  ; GCN-NEXT:   $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5
   ; GCN-NEXT:   $sgpr22 = IMPLICIT_DEF
-  ; GCN-NEXT:   renamable $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5
-  ; GCN-NEXT:   dead $vgpr4 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc
+  ; GCN-NEXT:   renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2
+  ; GCN-NEXT:   dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5
+  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:
   ; GCN-NEXT:   successors: %bb.3(0x80000000)
-  ; GCN-NEXT:   liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5
+  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 0
+  ; GCN-NEXT:   $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.3:
-  ; GCN-NEXT:   liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5
+  ; GCN-NEXT:   liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
-  ; GCN-NEXT:   $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
-  ; GCN-NEXT:   $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
-  ; GCN-NEXT:   $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
-  ; GCN-NEXT:   $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31
-  ; GCN-NEXT:   $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30
-  ; GCN-NEXT:   $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29
-  ; GCN-NEXT:   $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28
-  ; GCN-NEXT:   $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27
-  ; GCN-NEXT:   $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26
-  ; GCN-NEXT:   $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25
-  ; GCN-NEXT:   $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24
-  ; GCN-NEXT:   $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23
-  ; GCN-NEXT:   $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 22
-  ; GCN-NEXT:   $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 21
-  ; GCN-NEXT:   $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 20
-  ; GCN-NEXT:   $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 19
-  ; GCN-NEXT:   $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 18
-  ; GCN-NEXT:   $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 17
-  ; GCN-NEXT:   $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 16
-  ; GCN-NEXT:   $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 15
-  ; GCN-NEXT:   $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 14
-  ; GCN-NEXT:   $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 13
-  ; GCN-NEXT:   $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 12
-  ; GCN-NEXT:   $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 11
-  ; GCN-NEXT:   $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 10
-  ; GCN-NEXT:   $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 9
-  ; GCN-NEXT:   $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 8
-  ; GCN-NEXT:   $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 7
-  ; GCN-NEXT:   $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 6
-  ; GCN-NEXT:   $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 5
-  ; GCN-NEXT:   $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 4
-  ; GCN-NEXT:   $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3
-  ; GCN-NEXT:   $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2
-  ; GCN-NEXT:   $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1
-  ; GCN-NEXT:   $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
-  ; GCN-NEXT:   $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 31
-  ; GCN-NEXT:   $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 30
-  ; GCN-NEXT:   $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 29
-  ; GCN-NEXT:   $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 28
-  ; GCN-NEXT:   $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 27
-  ; GCN-NEXT:   $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 26
-  ; GCN-NEXT:   $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 25
-  ; GCN-NEXT:   $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 24
-  ; GCN-NEXT:   $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 23
-  ; GCN-NEXT:   $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 22
-  ; GCN-NEXT:   $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 21
-  ; GCN-NEXT:   $sgpr24 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 20
-  ; GCN-NEXT:   $sgpr23 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 19
-  ; GCN-NEXT:   $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 18
-  ; GCN-NEXT:   $sgpr21 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 17
-  ; GCN-NEXT:   $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 16
-  ; GCN-NEXT:   $sgpr19 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 15
-  ; GCN-NEXT:   $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 14
-  ; GCN-NEXT:   $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 13
-  ; GCN-NEXT:   $sgpr16 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 12
-  ; GCN-NEXT:   $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 11
-  ; GCN-NEXT:   $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 10
-  ; GCN-NEXT:   $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 9
-  ; GCN-NEXT:   $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 8
-  ; GCN-NEXT:   $sgpr11 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 7
-  ; GCN-NEXT:   $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 6
-  ; GCN-NEXT:   $sgpr9 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 5
-  ; GCN-NEXT:   $sgpr8 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 4
-  ; GCN-NEXT:   $sgpr7 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3
-  ; GCN-NEXT:   $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2
-  ; GCN-NEXT:   $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1
-  ; GCN-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0
-  ; GCN-NEXT:   KILL killed renamable $vgpr5
-  ; GCN-NEXT:   $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
+  ; GCN-NEXT:   $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3
+  ; GCN-NEXT:   $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2
+  ; GCN-NEXT:   $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 1
+  ; GCN-NEXT:   $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 0
+  ; GCN-NEXT:   $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 31
+  ; GCN-NEXT:   $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 30
+  ; GCN-NEXT:   $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 29
+  ; GCN-NEXT:   $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 28
+  ; GCN-NEXT:   $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 27
+  ; GCN-NEXT:   $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 26
+  ; GCN-NEXT:   $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 25
+  ; GCN-NEXT:   $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 24
+  ; GCN-NEXT:   $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 23
+  ; GCN-NEXT:   $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 22
+  ; GCN-NEXT:   $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 21
+  ; GCN-NEXT:   $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 20
+  ; GCN-NEXT:   $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 19
+  ; GCN-NEXT:   $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 18
+  ; GCN-NEXT:   $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 17
+  ; GCN-NEXT:   $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 16
+  ; GCN-NEXT:   $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 15
+  ; GCN-NEXT:   $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 14
+  ; GCN-NEXT:   $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 13
+  ; GCN-NEXT:   $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 12
+  ; GCN-NEXT:   $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 11
+  ; GCN-NEXT:   $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 10
+  ; GCN-NEXT:   $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 9
+  ; GCN-NEXT:   $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 8
+  ; GCN-NEXT:   $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 7
+  ; GCN-NEXT:   $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 6
+  ; GCN-NEXT:   $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 5
+  ; GCN-NEXT:   $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 4
+  ; GCN-NEXT:   $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3
+  ; GCN-NEXT:   $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2
+  ; GCN-NEXT:   $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1
+  ; GCN-NEXT:   $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0
+  ; GCN-NEXT:   $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31
+  ; GCN-NEXT:   $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30
+  ; GCN-NEXT:   $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29
+  ; GCN-NEXT:   $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28
+  ; GCN-NEXT:   $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27
+  ; GCN-NEXT:   $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26
+  ; GCN-NEXT:   $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25
+  ; GCN-NEXT:   $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24
+  ; GCN-NEXT:   $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23
+  ; GCN-NEXT:   $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22
+  ; GCN-NEXT:   $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21
+  ; GCN-NEXT:   $sgpr24 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20
+  ; GCN-NEXT:   $sgpr23 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19
+  ; GCN-NEXT:   $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18
+  ; GCN-NEXT:   $sgpr21 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17
+  ; GCN-NEXT:   $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16
+  ; GCN-NEXT:   $sgpr19 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15
+  ; GCN-NEXT:   $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14
+  ; GCN-NEXT:   $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13
+  ; GCN-NEXT:   $sgpr16 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12
+  ; GCN-NEXT:   $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11
+  ; GCN-NEXT:   $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10
+  ; GCN-NEXT:   $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9
+  ; GCN-NEXT:   $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8
+  ; GCN-NEXT:   $sgpr11 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7
+  ; GCN-NEXT:   $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6
+  ; GCN-NEXT:   $sgpr9 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5
+  ; GCN-NEXT:   $sgpr8 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4
+  ; GCN-NEXT:   $sgpr7 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3
+  ; GCN-NEXT:   $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2
+  ; GCN-NEXT:   $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
+  ; GCN-NEXT:   $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
+  ; GCN-NEXT:   KILL killed renamable $vgpr2
+  ; GCN-NEXT:   $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 4
   ; GCN-NEXT:   $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-  ; GCN-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5)
-  ; GCN-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5)
-  ; GCN-NEXT:   $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5)
-  ; GCN-NEXT:   $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5)
-  ; GCN-NEXT:   $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5)
+  ; GCN-NEXT:   $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5)
+  ; GCN-NEXT:   $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5)
+  ; GCN-NEXT:   $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5)
+  ; GCN-NEXT:   $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5)
+  ; GCN-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5)
   ; GCN-NEXT:   $exec_lo = S_MOV_B32 killed $sgpr1
   ; GCN-NEXT:   $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc
   ; GCN-NEXT:   $sgpr33 = COPY $sgpr0

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 16550fc9588aef..f523b4a2495f19 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -1183,7 +1183,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
@@ -1297,11 +1297,11 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-NEXT:    buffer_store_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v4, s34, 0
-; GCN-NEXT:    v_writelane_b32 v4, s35, 1
-; GCN-NEXT:    v_writelane_b32 v4, s36, 2
-; GCN-NEXT:    v_writelane_b32 v4, s37, 3
-; GCN-NEXT:    v_mov_b32_e32 v5, v3
+; GCN-NEXT:    v_writelane_b32 v5, s34, 0
+; GCN-NEXT:    v_writelane_b32 v5, s35, 1
+; GCN-NEXT:    v_writelane_b32 v5, s36, 2
+; GCN-NEXT:    v_writelane_b32 v5, s37, 3
+; GCN-NEXT:    v_mov_b32_e32 v4, v3
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    ; implicit-def: $sgpr4
 ; GCN-NEXT:    ; implicit-def: $sgpr4
@@ -1310,30 +1310,30 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-NEXT:    ; implicit-def: $sgpr4
 ; GCN-NEXT:    ; implicit-def: $sgpr4
 ; GCN-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GCN-NEXT:    v_mov_b32_e32 v3, v5
+; GCN-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GCN-NEXT:    flat_load_dwordx4 v[5:8], v[2:3]
+; GCN-NEXT:    flat_load_dwordx4 v[6:9], v[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[5:8]
-; GCN-NEXT:    v_readlane_b32 s37, v4, 3
-; GCN-NEXT:    v_readlane_b32 s36, v4, 2
-; GCN-NEXT:    v_readlane_b32 s35, v4, 1
-; GCN-NEXT:    v_readlane_b32 s34, v4, 0
+; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[6:9]
+; GCN-NEXT:    v_readlane_b32 s37, v5, 3
+; GCN-NEXT:    v_readlane_b32 s36, v5, 2
+; GCN-NEXT:    v_readlane_b32 s35, v5, 1
+; GCN-NEXT:    v_readlane_b32 s34, v5, 0
 ; GCN-NEXT:    buffer_load_dword v255, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -1447,7 +1447,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 7a01679f9972c7..2c0f64f85d823a 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -211,15 +211,15 @@ entry:
 ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
 
 ; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0
-; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1
 
 
 ; GCN: s_swappc_b64
 
-; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 
 ; GCN: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
index d718b493218351..85a615c3d8ae8c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
@@ -10,9 +10,9 @@ body:             |
   bb.0:
     liveins: $sgpr50
     ; CHECK-LABEL: name: spill_csr_sgpr_argument
-    ; CHECK: liveins: $sgpr50, $vgpr0
+    ; CHECK: liveins: $sgpr50, $vgpr63
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr50, 0, $vgpr0
+    ; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr50, 0, $vgpr63
     ; CHECK-NEXT: S_NOP 0, implicit $sgpr50
     ; CHECK-NEXT: $sgpr50 = S_MOV_B32 0
     S_NOP 0, implicit $sgpr50

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
index efb022ccb0d556..11babc82e919b4 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir
@@ -53,41 +53,41 @@ body:             |
   bb.0:
     liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
     ; GCN-LABEL: name: sgpr_spill_lane_crossover
-    ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr0, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+    ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr66, 2, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr67, 3, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr68, 4, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr69, 5, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr70, 6, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr91, 27, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr0
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr0
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr66, 2, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr67, 3, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr68, 4, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr69, 5, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr70, 6, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr91, 27, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr63
+    ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr63
     ; GCN-NEXT: S_NOP 0
     ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
     ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr64, 1, [[DEF]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95

diff  --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
index 00564a7db77bc9..d2b960fe43f843 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
@@ -22,11 +22,11 @@ define void @spill_more_than_wavesize_csr_sgprs() {
 }
 
 ; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object:
-; CHECK-DAG:    v_writelane_b32 v0, s98, 63
-; CHECK-DAG:    v_writelane_b32 v1, s99, 0
+; CHECK-DAG:    v_writelane_b32 v1, s98, 63
+; CHECK-DAG:    v_writelane_b32 v2, s99, 0
 ; CHECK-NOT: dummy
-; CHECK-DAG:    v_readlane_b32 s99, v1, 0
-; CHECK-DAG:    v_readlane_b32 s98, v0, 63
+; CHECK-DAG:    v_readlane_b32 s99, v2, 0
+; CHECK-DAG:    v_readlane_b32 s98, v1, 63
 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca

diff  --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index c352229f6a494d..d8db2d53198687 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -1585,17 +1585,17 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s24, s33
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s33, s32
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s16, -1
-; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
-; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 exec_lo, s16
 ; WAVE32-WWM-PREALLOC-NEXT:    s_add_i32 s32, s32, 0x1200
-; WAVE32-WWM-PREALLOC-NEXT:    ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
-; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v32, s30, 0
-; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v32, s31, 1
+; WAVE32-WWM-PREALLOC-NEXT:    ; implicit-def: $vgpr32 : SGPR spill to VGPR lane
+; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v33, s30, 0
+; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v33, s31, 1
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s16, s32
-; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v33, s16, 0
+; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v32, s16, 0
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshr_b32 s16, s16, 5
-; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v33, s16, 1
+; WAVE32-WWM-PREALLOC-NEXT:    v_writelane_b32 v32, s16, 1
 ; WAVE32-WWM-PREALLOC-NEXT:    v_mov_b32_e32 v0, 42
 ; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v0, off, s[0:3], s33
 ; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1673,18 +1673,18 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-WWM-PREALLOC-NEXT:    ; implicit-def: $sgpr18
 ; WAVE32-WWM-PREALLOC-NEXT:    v_mov_b32_e32 v30, s18
 ; WAVE32-WWM-PREALLOC-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s5, v33, 1
-; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s4, v33, 0
+; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s5, v32, 1
+; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s4, v32, 0
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMSTART
 ; WAVE32-WWM-PREALLOC-NEXT:    ; use s5
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMEND
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
-; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s31, v32, 1
-; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s30, v32, 0
-; WAVE32-WWM-PREALLOC-NEXT:    ; kill: killed $vgpr33
+; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s31, v33, 1
+; WAVE32-WWM-PREALLOC-NEXT:    v_readlane_b32 s30, v33, 0
+; WAVE32-WWM-PREALLOC-NEXT:    ; kill: killed $vgpr32
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s4, -1
-; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
-; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
+; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
+; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 exec_lo, s4
 ; WAVE32-WWM-PREALLOC-NEXT:    s_add_i32 s32, s32, 0xffffee00
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s33, s24

diff  --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
index 57e4cec4eccb11..468a8463a06d6f 100644
--- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -252,31 +252,31 @@ define void @outgoing_f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_mov_b32 s16, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
-; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v42, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX7-NEXT:    s_mov_b32 s17, f16_result@abs32@hi
 ; GFX7-NEXT:    s_mov_b32 s16, f16_result@abs32@lo
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX7-NEXT:    v_mov_b32_e32 v42, v1
-; GFX7-NEXT:    v_mov_b32_e32 v41, v0
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX7-NEXT:    v_mov_b32_e32 v41, v1
+; GFX7-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    flat_store_short v[41:42], v0
-; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    flat_store_short v[40:41], v0
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX7-NEXT:    s_mov_b32 s33, s4
@@ -294,37 +294,37 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_mov_b32 s16, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
-; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v42, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX7-NEXT:    s_mov_b32 s17, v2f16_result@abs32@hi
 ; GFX7-NEXT:    s_mov_b32 s16, v2f16_result@abs32@lo
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX7-NEXT:    v_mov_b32_e32 v42, v1
-; GFX7-NEXT:    v_mov_b32_e32 v41, v0
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX7-NEXT:    v_mov_b32_e32 v41, v1
+; GFX7-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    flat_store_dword v[41:42], v0
-; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    flat_store_dword v[40:41], v0
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX7-NEXT:    s_mov_b32 s33, s4
@@ -342,18 +342,18 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_mov_b32 s16, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
-; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v42, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX7-NEXT:    s_mov_b32 s17, v4f16_result@abs32@hi
 ; GFX7-NEXT:    s_mov_b32 s16, v4f16_result@abs32@lo
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX7-NEXT:    v_mov_b32_e32 v42, v1
-; GFX7-NEXT:    v_mov_b32_e32 v41, v0
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX7-NEXT:    v_mov_b32_e32 v41, v1
+; GFX7-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -375,17 +375,17 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    v_or_b32_e32 v4, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v41
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v42, vcc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v40
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v41, vcc
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
-; GFX7-NEXT:    flat_store_dword v[41:42], v4
-; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    flat_store_dword v[40:41], v4
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX7-NEXT:    s_mov_b32 s33, s4
@@ -403,18 +403,18 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    s_mov_b32 s16, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
 ; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
-; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v42, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX7-NEXT:    s_mov_b32 s17, v8f16_result@abs32@hi
 ; GFX7-NEXT:    s_mov_b32 s16, v8f16_result@abs32@lo
 ; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX7-NEXT:    v_mov_b32_e32 v42, v1
-; GFX7-NEXT:    v_mov_b32_e32 v41, v0
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX7-NEXT:    v_mov_b32_e32 v41, v1
+; GFX7-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -456,23 +456,23 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 {
 ; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v41
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v42, vcc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v40
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v41, vcc
 ; GFX7-NEXT:    flat_store_dword v[0:1], v3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v41
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v42, vcc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v40
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v41, vcc
 ; GFX7-NEXT:    flat_store_dword v[0:1], v5
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v41
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v42, vcc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v40
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v41, vcc
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
-; GFX7-NEXT:    flat_store_dword v[41:42], v8
-; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    flat_store_dword v[40:41], v8
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v42, 2
 ; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX7-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index dda41ec131a31c..ebbce68221a94f 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -9,27 +9,27 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 16
+; GCN-NEXT:    v_writelane_b32 v41, s16, 16
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v41, s30, 0
+; GCN-NEXT:    v_writelane_b32 v41, s31, 1
+; GCN-NEXT:    v_writelane_b32 v41, s34, 2
+; GCN-NEXT:    v_writelane_b32 v41, s35, 3
+; GCN-NEXT:    v_writelane_b32 v41, s36, 4
+; GCN-NEXT:    v_writelane_b32 v41, s37, 5
+; GCN-NEXT:    v_writelane_b32 v41, s38, 6
+; GCN-NEXT:    v_writelane_b32 v41, s39, 7
+; GCN-NEXT:    v_writelane_b32 v41, s40, 8
+; GCN-NEXT:    v_writelane_b32 v41, s41, 9
+; GCN-NEXT:    v_writelane_b32 v41, s42, 10
+; GCN-NEXT:    v_writelane_b32 v41, s43, 11
+; GCN-NEXT:    v_writelane_b32 v41, s44, 12
+; GCN-NEXT:    v_writelane_b32 v41, s45, 13
+; GCN-NEXT:    v_writelane_b32 v41, s46, 14
+; GCN-NEXT:    v_writelane_b32 v41, s47, 15
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
@@ -58,9 +58,9 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_mov_b32 s43, s13
 ; GCN-NEXT:    s_mov_b32 s44, s14
 ; GCN-NEXT:    s_mov_b32 s45, s15
-; GCN-NEXT:    v_mov_b32_e32 v41, v31
+; GCN-NEXT:    v_mov_b32_e32 v40, v31
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:    v_mov_b32_e32 v31, v41
+; GCN-NEXT:    v_mov_b32_e32 v31, v40
 ; GCN-NEXT:    s_mov_b32 s12, s42
 ; GCN-NEXT:    s_mov_b32 s13, s43
 ; GCN-NEXT:    s_mov_b32 s14, s44
@@ -93,26 +93,26 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_addc_u32 s17, s17, wibble@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:  .LBB0_8: ; %UnifiedReturnBlock
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    v_readlane_b32 s4, v40, 16
+; GCN-NEXT:    v_readlane_b32 s47, v41, 15
+; GCN-NEXT:    v_readlane_b32 s46, v41, 14
+; GCN-NEXT:    v_readlane_b32 s45, v41, 13
+; GCN-NEXT:    v_readlane_b32 s44, v41, 12
+; GCN-NEXT:    v_readlane_b32 s43, v41, 11
+; GCN-NEXT:    v_readlane_b32 s42, v41, 10
+; GCN-NEXT:    v_readlane_b32 s41, v41, 9
+; GCN-NEXT:    v_readlane_b32 s40, v41, 8
+; GCN-NEXT:    v_readlane_b32 s39, v41, 7
+; GCN-NEXT:    v_readlane_b32 s38, v41, 6
+; GCN-NEXT:    v_readlane_b32 s37, v41, 5
+; GCN-NEXT:    v_readlane_b32 s36, v41, 4
+; GCN-NEXT:    v_readlane_b32 s35, v41, 3
+; GCN-NEXT:    v_readlane_b32 s34, v41, 2
+; GCN-NEXT:    v_readlane_b32 s31, v41, 1
+; GCN-NEXT:    v_readlane_b32 s30, v41, 0
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    v_readlane_b32 s4, v41, 16
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
@@ -257,45 +257,45 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    v_writelane_b32 v40, s16, 28
+; GCN-NEXT:    v_writelane_b32 v45, s16, 28
 ; GCN-NEXT:    s_addk_i32 s32, 0x800
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    v_writelane_b32 v40, s31, 1
-; GCN-NEXT:    v_writelane_b32 v40, s34, 2
-; GCN-NEXT:    v_writelane_b32 v40, s35, 3
-; GCN-NEXT:    v_writelane_b32 v40, s36, 4
-; GCN-NEXT:    v_writelane_b32 v40, s37, 5
-; GCN-NEXT:    v_writelane_b32 v40, s38, 6
-; GCN-NEXT:    v_writelane_b32 v40, s39, 7
-; GCN-NEXT:    v_writelane_b32 v40, s40, 8
-; GCN-NEXT:    v_writelane_b32 v40, s41, 9
-; GCN-NEXT:    v_writelane_b32 v40, s42, 10
-; GCN-NEXT:    v_writelane_b32 v40, s43, 11
-; GCN-NEXT:    v_writelane_b32 v40, s44, 12
-; GCN-NEXT:    v_writelane_b32 v40, s45, 13
-; GCN-NEXT:    v_writelane_b32 v40, s46, 14
-; GCN-NEXT:    v_writelane_b32 v40, s47, 15
-; GCN-NEXT:    v_writelane_b32 v40, s48, 16
-; GCN-NEXT:    v_writelane_b32 v40, s49, 17
-; GCN-NEXT:    v_writelane_b32 v40, s50, 18
-; GCN-NEXT:    v_writelane_b32 v40, s51, 19
-; GCN-NEXT:    v_writelane_b32 v40, s52, 20
-; GCN-NEXT:    v_writelane_b32 v40, s53, 21
-; GCN-NEXT:    v_writelane_b32 v40, s54, 22
-; GCN-NEXT:    v_writelane_b32 v40, s55, 23
-; GCN-NEXT:    v_writelane_b32 v40, s56, 24
-; GCN-NEXT:    v_writelane_b32 v40, s57, 25
-; GCN-NEXT:    v_writelane_b32 v40, s58, 26
-; GCN-NEXT:    v_writelane_b32 v40, s59, 27
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v45, s30, 0
+; GCN-NEXT:    v_writelane_b32 v45, s31, 1
+; GCN-NEXT:    v_writelane_b32 v45, s34, 2
+; GCN-NEXT:    v_writelane_b32 v45, s35, 3
+; GCN-NEXT:    v_writelane_b32 v45, s36, 4
+; GCN-NEXT:    v_writelane_b32 v45, s37, 5
+; GCN-NEXT:    v_writelane_b32 v45, s38, 6
+; GCN-NEXT:    v_writelane_b32 v45, s39, 7
+; GCN-NEXT:    v_writelane_b32 v45, s40, 8
+; GCN-NEXT:    v_writelane_b32 v45, s41, 9
+; GCN-NEXT:    v_writelane_b32 v45, s42, 10
+; GCN-NEXT:    v_writelane_b32 v45, s43, 11
+; GCN-NEXT:    v_writelane_b32 v45, s44, 12
+; GCN-NEXT:    v_writelane_b32 v45, s45, 13
+; GCN-NEXT:    v_writelane_b32 v45, s46, 14
+; GCN-NEXT:    v_writelane_b32 v45, s47, 15
+; GCN-NEXT:    v_writelane_b32 v45, s48, 16
+; GCN-NEXT:    v_writelane_b32 v45, s49, 17
+; GCN-NEXT:    v_writelane_b32 v45, s50, 18
+; GCN-NEXT:    v_writelane_b32 v45, s51, 19
+; GCN-NEXT:    v_writelane_b32 v45, s52, 20
+; GCN-NEXT:    v_writelane_b32 v45, s53, 21
+; GCN-NEXT:    v_writelane_b32 v45, s54, 22
+; GCN-NEXT:    v_writelane_b32 v45, s55, 23
+; GCN-NEXT:    v_writelane_b32 v45, s56, 24
+; GCN-NEXT:    v_writelane_b32 v45, s57, 25
+; GCN-NEXT:    v_writelane_b32 v45, s58, 26
+; GCN-NEXT:    v_writelane_b32 v45, s59, 27
 ; GCN-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v41, v31
+; GCN-NEXT:    v_mov_b32_e32 v40, v31
 ; GCN-NEXT:    s_mov_b32 s46, s15
 ; GCN-NEXT:    s_mov_b32 s47, s14
 ; GCN-NEXT:    s_mov_b32 s48, s13
@@ -305,18 +305,18 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v41
-; GCN-NEXT:    v_mov_b32_e32 v43, 0
-; GCN-NEXT:    flat_load_dword v44, v[0:1]
+; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v40
+; GCN-NEXT:    v_mov_b32_e32 v42, 0
+; GCN-NEXT:    flat_load_dword v43, v[0:1]
 ; GCN-NEXT:    s_mov_b64 s[50:51], 0
 ; GCN-NEXT:    s_getpc_b64 s[52:53]
 ; GCN-NEXT:    s_add_u32 s52, s52, spam@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s53, s53, spam@rel32@hi+12
-; GCN-NEXT:    v_lshlrev_b32_e32 v42, 2, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v41, 2, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_f32_e64 s[54:55], 0, v44
-; GCN-NEXT:    v_cmp_neq_f32_e64 s[42:43], 0, v44
-; GCN-NEXT:    v_mov_b32_e32 v45, 0x7fc00000
+; GCN-NEXT:    v_cmp_eq_f32_e64 s[54:55], 0, v43
+; GCN-NEXT:    v_cmp_neq_f32_e64 s[42:43], 0, v43
+; GCN-NEXT:    v_mov_b32_e32 v44, 0x7fc00000
 ; GCN-NEXT:    s_branch .LBB1_2
 ; GCN-NEXT:  .LBB1_1: ; %Flow7
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
@@ -327,8 +327,8 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_cbranch_execz .LBB1_18
 ; GCN-NEXT:  .LBB1_2: ; %bb2
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    flat_load_dword v0, v[42:43]
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], 0
+; GCN-NEXT:    flat_load_dword v0, v[41:42]
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], 0
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 2, v0
@@ -351,7 +351,7 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_mov_b32 s13, s48
 ; GCN-NEXT:    s_mov_b32 s14, s47
 ; GCN-NEXT:    s_mov_b32 s15, s46
-; GCN-NEXT:    v_mov_b32_e32 v31, v41
+; GCN-NEXT:    v_mov_b32_e32 v31, v40
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[52:53]
 ; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
@@ -364,7 +364,7 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_cbranch_execz .LBB1_7
 ; GCN-NEXT:  ; %bb.6: ; %bb16
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:    s_or_b64 s[8:9], s[54:55], exec
 ; GCN-NEXT:  .LBB1_7: ; %Flow3
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
@@ -376,7 +376,7 @@ define hidden void @blam() {
 ; GCN-NEXT:  ; %bb.8: ; %bb17
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], 0
 ; GCN-NEXT:  .LBB1_9: ; %Flow4
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
@@ -406,7 +406,7 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_cbranch_execz .LBB1_15
 ; GCN-NEXT:  ; %bb.14: ; %bb10
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:    s_or_b64 s[10:11], s[6:7], exec
 ; GCN-NEXT:  .LBB1_15: ; %Flow6
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
@@ -424,47 +424,47 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_cbranch_execz .LBB1_1
 ; GCN-NEXT:  ; %bb.17: ; %bb18
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_branch .LBB1_1
 ; GCN-NEXT:  .LBB1_18: ; %DummyReturnBlock
 ; GCN-NEXT:    s_or_b64 exec, exec, s[50:51]
-; GCN-NEXT:    v_readlane_b32 s59, v40, 27
-; GCN-NEXT:    v_readlane_b32 s58, v40, 26
-; GCN-NEXT:    v_readlane_b32 s57, v40, 25
-; GCN-NEXT:    v_readlane_b32 s56, v40, 24
-; GCN-NEXT:    v_readlane_b32 s55, v40, 23
-; GCN-NEXT:    v_readlane_b32 s54, v40, 22
-; GCN-NEXT:    v_readlane_b32 s53, v40, 21
-; GCN-NEXT:    v_readlane_b32 s52, v40, 20
-; GCN-NEXT:    v_readlane_b32 s51, v40, 19
-; GCN-NEXT:    v_readlane_b32 s50, v40, 18
-; GCN-NEXT:    v_readlane_b32 s49, v40, 17
-; GCN-NEXT:    v_readlane_b32 s48, v40, 16
-; GCN-NEXT:    v_readlane_b32 s47, v40, 15
-; GCN-NEXT:    v_readlane_b32 s46, v40, 14
-; GCN-NEXT:    v_readlane_b32 s45, v40, 13
-; GCN-NEXT:    v_readlane_b32 s44, v40, 12
-; GCN-NEXT:    v_readlane_b32 s43, v40, 11
-; GCN-NEXT:    v_readlane_b32 s42, v40, 10
-; GCN-NEXT:    v_readlane_b32 s41, v40, 9
-; GCN-NEXT:    v_readlane_b32 s40, v40, 8
-; GCN-NEXT:    v_readlane_b32 s39, v40, 7
-; GCN-NEXT:    v_readlane_b32 s38, v40, 6
-; GCN-NEXT:    v_readlane_b32 s37, v40, 5
-; GCN-NEXT:    v_readlane_b32 s36, v40, 4
-; GCN-NEXT:    v_readlane_b32 s35, v40, 3
-; GCN-NEXT:    v_readlane_b32 s34, v40, 2
-; GCN-NEXT:    v_readlane_b32 s31, v40, 1
-; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    v_readlane_b32 s4, v40, 28
+; GCN-NEXT:    v_readlane_b32 s59, v45, 27
+; GCN-NEXT:    v_readlane_b32 s58, v45, 26
+; GCN-NEXT:    v_readlane_b32 s57, v45, 25
+; GCN-NEXT:    v_readlane_b32 s56, v45, 24
+; GCN-NEXT:    v_readlane_b32 s55, v45, 23
+; GCN-NEXT:    v_readlane_b32 s54, v45, 22
+; GCN-NEXT:    v_readlane_b32 s53, v45, 21
+; GCN-NEXT:    v_readlane_b32 s52, v45, 20
+; GCN-NEXT:    v_readlane_b32 s51, v45, 19
+; GCN-NEXT:    v_readlane_b32 s50, v45, 18
+; GCN-NEXT:    v_readlane_b32 s49, v45, 17
+; GCN-NEXT:    v_readlane_b32 s48, v45, 16
+; GCN-NEXT:    v_readlane_b32 s47, v45, 15
+; GCN-NEXT:    v_readlane_b32 s46, v45, 14
+; GCN-NEXT:    v_readlane_b32 s45, v45, 13
+; GCN-NEXT:    v_readlane_b32 s44, v45, 12
+; GCN-NEXT:    v_readlane_b32 s43, v45, 11
+; GCN-NEXT:    v_readlane_b32 s42, v45, 10
+; GCN-NEXT:    v_readlane_b32 s41, v45, 9
+; GCN-NEXT:    v_readlane_b32 s40, v45, 8
+; GCN-NEXT:    v_readlane_b32 s39, v45, 7
+; GCN-NEXT:    v_readlane_b32 s38, v45, 6
+; GCN-NEXT:    v_readlane_b32 s37, v45, 5
+; GCN-NEXT:    v_readlane_b32 s36, v45, 4
+; GCN-NEXT:    v_readlane_b32 s35, v45, 3
+; GCN-NEXT:    v_readlane_b32 s34, v45, 2
+; GCN-NEXT:    v_readlane_b32 s31, v45, 1
+; GCN-NEXT:    v_readlane_b32 s30, v45, 0
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    v_readlane_b32 s4, v45, 28
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xf800
 ; GCN-NEXT:    s_mov_b32 s33, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
index f3276719ac13ca..5ced02f28c977f 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
@@ -1,188 +1,214 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX900 %s
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s
-; RUN: not --crash llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX908-ERR %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90a %s
 
-; This test crashes for gfx908 while allocating the tuple. Compared to the other subtargets,
+; This test used to crash for gfx908 while allocating the tuple. Compared to the other subtargets,
 ; gfx908 marks an extra VGPR reserved for AGPR to VGPR copy that puts more register pressure.
-
-; GFX908-ERR: error: ran out of registers during register allocation
+; To minimize the register pressure, the VGPRs used for CSR SGPR spilling have been picked from the
+; higher available range, thereby leaving more VGPRs available in the lowest range for allocation.
 
 define i32 @test_tuple(<16 x i64> %0) {
 ; GFX900-LABEL: test_tuple:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX900-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX900-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX900-NEXT:    v_writelane_b32 v31, s36, 0
-; GFX900-NEXT:    v_writelane_b32 v31, s37, 1
-; GFX900-NEXT:    v_writelane_b32 v31, s38, 2
-; GFX900-NEXT:    v_writelane_b32 v31, s39, 3
-; GFX900-NEXT:    v_writelane_b32 v31, s40, 4
-; GFX900-NEXT:    v_writelane_b32 v31, s41, 5
-; GFX900-NEXT:    v_writelane_b32 v31, s42, 6
-; GFX900-NEXT:    v_writelane_b32 v31, s43, 7
-; GFX900-NEXT:    v_writelane_b32 v31, s44, 8
-; GFX900-NEXT:    v_writelane_b32 v31, s45, 9
-; GFX900-NEXT:    v_writelane_b32 v31, s46, 10
-; GFX900-NEXT:    v_writelane_b32 v31, s47, 11
-; GFX900-NEXT:    v_writelane_b32 v31, s48, 12
-; GFX900-NEXT:    v_writelane_b32 v31, s49, 13
-; GFX900-NEXT:    v_writelane_b32 v31, s50, 14
-; GFX900-NEXT:    v_writelane_b32 v31, s51, 15
-; GFX900-NEXT:    v_writelane_b32 v31, s52, 16
-; GFX900-NEXT:    v_writelane_b32 v31, s53, 17
-; GFX900-NEXT:    v_writelane_b32 v31, s54, 18
-; GFX900-NEXT:    v_writelane_b32 v31, s55, 19
-; GFX900-NEXT:    v_writelane_b32 v31, s56, 20
-; GFX900-NEXT:    v_writelane_b32 v31, s57, 21
-; GFX900-NEXT:    v_writelane_b32 v31, s58, 22
-; GFX900-NEXT:    v_writelane_b32 v31, s59, 23
-; GFX900-NEXT:    v_writelane_b32 v31, s60, 24
-; GFX900-NEXT:    v_writelane_b32 v31, s61, 25
-; GFX900-NEXT:    v_writelane_b32 v31, s62, 26
-; GFX900-NEXT:    v_writelane_b32 v31, s63, 27
-; GFX900-NEXT:    v_writelane_b32 v31, s64, 28
-; GFX900-NEXT:    v_writelane_b32 v31, s65, 29
-; GFX900-NEXT:    v_writelane_b32 v31, s66, 30
-; GFX900-NEXT:    v_writelane_b32 v31, s67, 31
-; GFX900-NEXT:    v_mov_b32_e32 v32, v0
-; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s32
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; implicit-def: $sgpr4
-; GFX900-NEXT:    ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec
-; GFX900-NEXT:    v_mov_b32_e32 v33, v1
-; GFX900-NEXT:    v_mov_b32_e32 v34, v2
-; GFX900-NEXT:    v_mov_b32_e32 v35, v3
-; GFX900-NEXT:    v_mov_b32_e32 v36, v4
-; GFX900-NEXT:    v_mov_b32_e32 v37, v5
-; GFX900-NEXT:    v_mov_b32_e32 v38, v6
-; GFX900-NEXT:    v_mov_b32_e32 v39, v7
-; GFX900-NEXT:    v_mov_b32_e32 v40, v8
-; GFX900-NEXT:    v_mov_b32_e32 v41, v9
-; GFX900-NEXT:    v_mov_b32_e32 v42, v10
-; GFX900-NEXT:    v_mov_b32_e32 v43, v11
-; GFX900-NEXT:    v_mov_b32_e32 v44, v12
-; GFX900-NEXT:    v_mov_b32_e32 v45, v13
-; GFX900-NEXT:    v_mov_b32_e32 v46, v14
-; GFX900-NEXT:    v_mov_b32_e32 v47, v15
-; GFX900-NEXT:    v_mov_b32_e32 v48, v16
-; GFX900-NEXT:    v_mov_b32_e32 v49, v17
-; GFX900-NEXT:    v_mov_b32_e32 v50, v18
-; GFX900-NEXT:    v_mov_b32_e32 v51, v19
-; GFX900-NEXT:    v_mov_b32_e32 v52, v20
-; GFX900-NEXT:    v_mov_b32_e32 v53, v21
-; GFX900-NEXT:    v_mov_b32_e32 v54, v22
-; GFX900-NEXT:    v_mov_b32_e32 v55, v23
-; GFX900-NEXT:    v_mov_b32_e32 v56, v24
-; GFX900-NEXT:    v_mov_b32_e32 v57, v25
-; GFX900-NEXT:    v_mov_b32_e32 v58, v26
-; GFX900-NEXT:    v_mov_b32_e32 v59, v27
-; GFX900-NEXT:    v_mov_b32_e32 v60, v28
-; GFX900-NEXT:    v_mov_b32_e32 v61, v29
-; GFX900-NEXT:    v_mov_b32_e32 v62, v30
-; GFX900-NEXT:    ; kill: def $vgpr63 killed $vgpr0 killed $exec
+; GFX900-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX900-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT:    v_writelane_b32 v63, s36, 0
+; GFX900-NEXT:    v_writelane_b32 v63, s37, 1
+; GFX900-NEXT:    v_writelane_b32 v63, s38, 2
+; GFX900-NEXT:    v_writelane_b32 v63, s39, 3
+; GFX900-NEXT:    v_writelane_b32 v63, s40, 4
+; GFX900-NEXT:    v_writelane_b32 v63, s41, 5
+; GFX900-NEXT:    v_writelane_b32 v63, s42, 6
+; GFX900-NEXT:    v_writelane_b32 v63, s43, 7
+; GFX900-NEXT:    v_writelane_b32 v63, s44, 8
+; GFX900-NEXT:    v_writelane_b32 v63, s45, 9
+; GFX900-NEXT:    v_writelane_b32 v63, s46, 10
+; GFX900-NEXT:    v_writelane_b32 v63, s47, 11
+; GFX900-NEXT:    v_writelane_b32 v63, s48, 12
+; GFX900-NEXT:    v_writelane_b32 v63, s49, 13
+; GFX900-NEXT:    v_writelane_b32 v63, s50, 14
+; GFX900-NEXT:    v_writelane_b32 v63, s51, 15
+; GFX900-NEXT:    v_writelane_b32 v63, s52, 16
+; GFX900-NEXT:    v_writelane_b32 v63, s53, 17
+; GFX900-NEXT:    v_writelane_b32 v63, s54, 18
+; GFX900-NEXT:    v_writelane_b32 v63, s55, 19
+; GFX900-NEXT:    v_writelane_b32 v63, s56, 20
+; GFX900-NEXT:    v_writelane_b32 v63, s57, 21
+; GFX900-NEXT:    v_writelane_b32 v63, s58, 22
+; GFX900-NEXT:    v_writelane_b32 v63, s59, 23
+; GFX900-NEXT:    v_writelane_b32 v63, s60, 24
+; GFX900-NEXT:    v_writelane_b32 v63, s61, 25
+; GFX900-NEXT:    v_writelane_b32 v63, s62, 26
+; GFX900-NEXT:    v_writelane_b32 v63, s63, 27
+; GFX900-NEXT:    v_writelane_b32 v63, s64, 28
+; GFX900-NEXT:    v_writelane_b32 v63, s65, 29
+; GFX900-NEXT:    v_writelane_b32 v63, s66, 30
+; GFX900-NEXT:    v_writelane_b32 v63, s67, 31
+; GFX900-NEXT:    v_mov_b32_e32 v33, v30
+; GFX900-NEXT:    v_mov_b32_e32 v34, v29
+; GFX900-NEXT:    v_mov_b32_e32 v35, v28
+; GFX900-NEXT:    v_mov_b32_e32 v36, v27
+; GFX900-NEXT:    v_mov_b32_e32 v37, v26
+; GFX900-NEXT:    v_mov_b32_e32 v38, v25
+; GFX900-NEXT:    v_mov_b32_e32 v39, v24
+; GFX900-NEXT:    v_mov_b32_e32 v48, v23
+; GFX900-NEXT:    v_mov_b32_e32 v49, v22
+; GFX900-NEXT:    v_mov_b32_e32 v50, v21
+; GFX900-NEXT:    v_mov_b32_e32 v51, v20
+; GFX900-NEXT:    v_mov_b32_e32 v52, v19
+; GFX900-NEXT:    v_mov_b32_e32 v53, v18
+; GFX900-NEXT:    v_mov_b32_e32 v54, v17
+; GFX900-NEXT:    v_mov_b32_e32 v55, v16
+; GFX900-NEXT:    v_mov_b32_e32 v40, v15
+; GFX900-NEXT:    v_mov_b32_e32 v41, v14
+; GFX900-NEXT:    v_mov_b32_e32 v42, v13
+; GFX900-NEXT:    v_mov_b32_e32 v43, v12
+; GFX900-NEXT:    v_mov_b32_e32 v44, v11
+; GFX900-NEXT:    v_mov_b32_e32 v45, v10
+; GFX900-NEXT:    v_mov_b32_e32 v46, v9
+; GFX900-NEXT:    v_mov_b32_e32 v47, v8
+; GFX900-NEXT:    v_mov_b32_e32 v56, v7
+; GFX900-NEXT:    v_mov_b32_e32 v57, v6
+; GFX900-NEXT:    v_mov_b32_e32 v58, v5
+; GFX900-NEXT:    v_mov_b32_e32 v59, v4
+; GFX900-NEXT:    v_mov_b32_e32 v60, v3
+; GFX900-NEXT:    v_mov_b32_e32 v61, v2
+; GFX900-NEXT:    v_mov_b32_e32 v62, v1
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; implicit-def: $sgpr4
+; GFX900-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
+; GFX900-NEXT:    v_mov_b32_e32 v1, v62
+; GFX900-NEXT:    v_mov_b32_e32 v2, v61
+; GFX900-NEXT:    v_mov_b32_e32 v3, v60
+; GFX900-NEXT:    v_mov_b32_e32 v4, v59
+; GFX900-NEXT:    v_mov_b32_e32 v5, v58
+; GFX900-NEXT:    v_mov_b32_e32 v6, v57
+; GFX900-NEXT:    v_mov_b32_e32 v7, v56
+; GFX900-NEXT:    v_mov_b32_e32 v8, v47
+; GFX900-NEXT:    v_mov_b32_e32 v9, v46
+; GFX900-NEXT:    v_mov_b32_e32 v10, v45
+; GFX900-NEXT:    v_mov_b32_e32 v11, v44
+; GFX900-NEXT:    v_mov_b32_e32 v12, v43
+; GFX900-NEXT:    v_mov_b32_e32 v13, v42
+; GFX900-NEXT:    v_mov_b32_e32 v14, v41
+; GFX900-NEXT:    v_mov_b32_e32 v15, v40
+; GFX900-NEXT:    v_mov_b32_e32 v16, v55
+; GFX900-NEXT:    v_mov_b32_e32 v17, v54
+; GFX900-NEXT:    v_mov_b32_e32 v18, v53
+; GFX900-NEXT:    v_mov_b32_e32 v19, v52
+; GFX900-NEXT:    v_mov_b32_e32 v20, v51
+; GFX900-NEXT:    v_mov_b32_e32 v21, v50
+; GFX900-NEXT:    v_mov_b32_e32 v22, v49
+; GFX900-NEXT:    v_mov_b32_e32 v23, v48
+; GFX900-NEXT:    v_mov_b32_e32 v24, v39
+; GFX900-NEXT:    v_mov_b32_e32 v25, v38
+; GFX900-NEXT:    v_mov_b32_e32 v26, v37
+; GFX900-NEXT:    v_mov_b32_e32 v27, v36
+; GFX900-NEXT:    v_mov_b32_e32 v28, v35
+; GFX900-NEXT:    v_mov_b32_e32 v29, v34
+; GFX900-NEXT:    v_mov_b32_e32 v30, v33
+; GFX900-NEXT:    ; kill: def $vgpr31 killed $vgpr32 killed $exec
 ; GFX900-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; GFX900-NEXT:    v_readlane_b32 s67, v31, 31
-; GFX900-NEXT:    v_readlane_b32 s66, v31, 30
-; GFX900-NEXT:    v_readlane_b32 s65, v31, 29
-; GFX900-NEXT:    v_readlane_b32 s64, v31, 28
-; GFX900-NEXT:    v_readlane_b32 s63, v31, 27
-; GFX900-NEXT:    v_readlane_b32 s62, v31, 26
-; GFX900-NEXT:    v_readlane_b32 s61, v31, 25
-; GFX900-NEXT:    v_readlane_b32 s60, v31, 24
-; GFX900-NEXT:    v_readlane_b32 s59, v31, 23
-; GFX900-NEXT:    v_readlane_b32 s58, v31, 22
-; GFX900-NEXT:    v_readlane_b32 s57, v31, 21
-; GFX900-NEXT:    v_readlane_b32 s56, v31, 20
-; GFX900-NEXT:    v_readlane_b32 s55, v31, 19
-; GFX900-NEXT:    v_readlane_b32 s54, v31, 18
-; GFX900-NEXT:    v_readlane_b32 s53, v31, 17
-; GFX900-NEXT:    v_readlane_b32 s52, v31, 16
-; GFX900-NEXT:    v_readlane_b32 s51, v31, 15
-; GFX900-NEXT:    v_readlane_b32 s50, v31, 14
-; GFX900-NEXT:    v_readlane_b32 s49, v31, 13
-; GFX900-NEXT:    v_readlane_b32 s48, v31, 12
-; GFX900-NEXT:    v_readlane_b32 s47, v31, 11
-; GFX900-NEXT:    v_readlane_b32 s46, v31, 10
-; GFX900-NEXT:    v_readlane_b32 s45, v31, 9
-; GFX900-NEXT:    v_readlane_b32 s44, v31, 8
-; GFX900-NEXT:    v_readlane_b32 s43, v31, 7
-; GFX900-NEXT:    v_readlane_b32 s42, v31, 6
-; GFX900-NEXT:    v_readlane_b32 s41, v31, 5
-; GFX900-NEXT:    v_readlane_b32 s40, v31, 4
-; GFX900-NEXT:    v_readlane_b32 s39, v31, 3
-; GFX900-NEXT:    v_readlane_b32 s38, v31, 2
-; GFX900-NEXT:    v_readlane_b32 s37, v31, 1
-; GFX900-NEXT:    v_readlane_b32 s36, v31, 0
-; GFX900-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX900-NEXT:    v_readlane_b32 s67, v63, 31
+; GFX900-NEXT:    v_readlane_b32 s66, v63, 30
+; GFX900-NEXT:    v_readlane_b32 s65, v63, 29
+; GFX900-NEXT:    v_readlane_b32 s64, v63, 28
+; GFX900-NEXT:    v_readlane_b32 s63, v63, 27
+; GFX900-NEXT:    v_readlane_b32 s62, v63, 26
+; GFX900-NEXT:    v_readlane_b32 s61, v63, 25
+; GFX900-NEXT:    v_readlane_b32 s60, v63, 24
+; GFX900-NEXT:    v_readlane_b32 s59, v63, 23
+; GFX900-NEXT:    v_readlane_b32 s58, v63, 22
+; GFX900-NEXT:    v_readlane_b32 s57, v63, 21
+; GFX900-NEXT:    v_readlane_b32 s56, v63, 20
+; GFX900-NEXT:    v_readlane_b32 s55, v63, 19
+; GFX900-NEXT:    v_readlane_b32 s54, v63, 18
+; GFX900-NEXT:    v_readlane_b32 s53, v63, 17
+; GFX900-NEXT:    v_readlane_b32 s52, v63, 16
+; GFX900-NEXT:    v_readlane_b32 s51, v63, 15
+; GFX900-NEXT:    v_readlane_b32 s50, v63, 14
+; GFX900-NEXT:    v_readlane_b32 s49, v63, 13
+; GFX900-NEXT:    v_readlane_b32 s48, v63, 12
+; GFX900-NEXT:    v_readlane_b32 s47, v63, 11
+; GFX900-NEXT:    v_readlane_b32 s46, v63, 10
+; GFX900-NEXT:    v_readlane_b32 s45, v63, 9
+; GFX900-NEXT:    v_readlane_b32 s44, v63, 8
+; GFX900-NEXT:    v_readlane_b32 s43, v63, 7
+; GFX900-NEXT:    v_readlane_b32 s42, v63, 6
+; GFX900-NEXT:    v_readlane_b32 s41, v63, 5
+; GFX900-NEXT:    v_readlane_b32 s40, v63, 4
+; GFX900-NEXT:    v_readlane_b32 s39, v63, 3
+; GFX900-NEXT:    v_readlane_b32 s38, v63, 2
+; GFX900-NEXT:    v_readlane_b32 s37, v63, 1
+; GFX900-NEXT:    v_readlane_b32 s36, v63, 0
+; GFX900-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX900-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -190,185 +216,416 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX906-LABEL: test_tuple:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX906-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX906-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX906-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_writelane_b32 v31, s36, 0
-; GFX906-NEXT:    v_writelane_b32 v31, s37, 1
-; GFX906-NEXT:    v_writelane_b32 v31, s38, 2
-; GFX906-NEXT:    v_writelane_b32 v31, s39, 3
-; GFX906-NEXT:    v_writelane_b32 v31, s40, 4
-; GFX906-NEXT:    v_writelane_b32 v31, s41, 5
-; GFX906-NEXT:    v_writelane_b32 v31, s42, 6
-; GFX906-NEXT:    v_writelane_b32 v31, s43, 7
-; GFX906-NEXT:    v_writelane_b32 v31, s44, 8
-; GFX906-NEXT:    v_writelane_b32 v31, s45, 9
-; GFX906-NEXT:    v_writelane_b32 v31, s46, 10
-; GFX906-NEXT:    v_writelane_b32 v31, s47, 11
-; GFX906-NEXT:    v_writelane_b32 v31, s48, 12
-; GFX906-NEXT:    v_writelane_b32 v31, s49, 13
-; GFX906-NEXT:    v_writelane_b32 v31, s50, 14
-; GFX906-NEXT:    v_writelane_b32 v31, s51, 15
-; GFX906-NEXT:    v_writelane_b32 v31, s52, 16
-; GFX906-NEXT:    v_writelane_b32 v31, s53, 17
-; GFX906-NEXT:    v_writelane_b32 v31, s54, 18
-; GFX906-NEXT:    v_writelane_b32 v31, s55, 19
-; GFX906-NEXT:    v_writelane_b32 v31, s56, 20
-; GFX906-NEXT:    v_writelane_b32 v31, s57, 21
-; GFX906-NEXT:    v_writelane_b32 v31, s58, 22
-; GFX906-NEXT:    v_writelane_b32 v31, s59, 23
-; GFX906-NEXT:    v_writelane_b32 v31, s60, 24
-; GFX906-NEXT:    v_writelane_b32 v31, s61, 25
-; GFX906-NEXT:    v_writelane_b32 v31, s62, 26
-; GFX906-NEXT:    v_writelane_b32 v31, s63, 27
-; GFX906-NEXT:    v_writelane_b32 v31, s64, 28
-; GFX906-NEXT:    v_writelane_b32 v31, s65, 29
-; GFX906-NEXT:    v_writelane_b32 v31, s66, 30
-; GFX906-NEXT:    v_writelane_b32 v31, s67, 31
-; GFX906-NEXT:    v_mov_b32_e32 v32, v0
-; GFX906-NEXT:    buffer_load_dword v0, off, s[0:3], s32
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; implicit-def: $sgpr4
-; GFX906-NEXT:    ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec
-; GFX906-NEXT:    v_mov_b32_e32 v33, v1
-; GFX906-NEXT:    v_mov_b32_e32 v34, v2
-; GFX906-NEXT:    v_mov_b32_e32 v35, v3
-; GFX906-NEXT:    v_mov_b32_e32 v36, v4
-; GFX906-NEXT:    v_mov_b32_e32 v37, v5
-; GFX906-NEXT:    v_mov_b32_e32 v38, v6
-; GFX906-NEXT:    v_mov_b32_e32 v39, v7
-; GFX906-NEXT:    v_mov_b32_e32 v40, v8
-; GFX906-NEXT:    v_mov_b32_e32 v41, v9
-; GFX906-NEXT:    v_mov_b32_e32 v42, v10
-; GFX906-NEXT:    v_mov_b32_e32 v43, v11
-; GFX906-NEXT:    v_mov_b32_e32 v44, v12
-; GFX906-NEXT:    v_mov_b32_e32 v45, v13
-; GFX906-NEXT:    v_mov_b32_e32 v46, v14
-; GFX906-NEXT:    v_mov_b32_e32 v47, v15
-; GFX906-NEXT:    v_mov_b32_e32 v48, v16
-; GFX906-NEXT:    v_mov_b32_e32 v49, v17
-; GFX906-NEXT:    v_mov_b32_e32 v50, v18
-; GFX906-NEXT:    v_mov_b32_e32 v51, v19
-; GFX906-NEXT:    v_mov_b32_e32 v52, v20
-; GFX906-NEXT:    v_mov_b32_e32 v53, v21
-; GFX906-NEXT:    v_mov_b32_e32 v54, v22
-; GFX906-NEXT:    v_mov_b32_e32 v55, v23
-; GFX906-NEXT:    v_mov_b32_e32 v56, v24
-; GFX906-NEXT:    v_mov_b32_e32 v57, v25
-; GFX906-NEXT:    v_mov_b32_e32 v58, v26
-; GFX906-NEXT:    v_mov_b32_e32 v59, v27
-; GFX906-NEXT:    v_mov_b32_e32 v60, v28
-; GFX906-NEXT:    v_mov_b32_e32 v61, v29
-; GFX906-NEXT:    v_mov_b32_e32 v62, v30
-; GFX906-NEXT:    ; kill: def $vgpr63 killed $vgpr0 killed $exec
+; GFX906-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_writelane_b32 v63, s36, 0
+; GFX906-NEXT:    v_writelane_b32 v63, s37, 1
+; GFX906-NEXT:    v_writelane_b32 v63, s38, 2
+; GFX906-NEXT:    v_writelane_b32 v63, s39, 3
+; GFX906-NEXT:    v_writelane_b32 v63, s40, 4
+; GFX906-NEXT:    v_writelane_b32 v63, s41, 5
+; GFX906-NEXT:    v_writelane_b32 v63, s42, 6
+; GFX906-NEXT:    v_writelane_b32 v63, s43, 7
+; GFX906-NEXT:    v_writelane_b32 v63, s44, 8
+; GFX906-NEXT:    v_writelane_b32 v63, s45, 9
+; GFX906-NEXT:    v_writelane_b32 v63, s46, 10
+; GFX906-NEXT:    v_writelane_b32 v63, s47, 11
+; GFX906-NEXT:    v_writelane_b32 v63, s48, 12
+; GFX906-NEXT:    v_writelane_b32 v63, s49, 13
+; GFX906-NEXT:    v_writelane_b32 v63, s50, 14
+; GFX906-NEXT:    v_writelane_b32 v63, s51, 15
+; GFX906-NEXT:    v_writelane_b32 v63, s52, 16
+; GFX906-NEXT:    v_writelane_b32 v63, s53, 17
+; GFX906-NEXT:    v_writelane_b32 v63, s54, 18
+; GFX906-NEXT:    v_writelane_b32 v63, s55, 19
+; GFX906-NEXT:    v_writelane_b32 v63, s56, 20
+; GFX906-NEXT:    v_writelane_b32 v63, s57, 21
+; GFX906-NEXT:    v_writelane_b32 v63, s58, 22
+; GFX906-NEXT:    v_writelane_b32 v63, s59, 23
+; GFX906-NEXT:    v_writelane_b32 v63, s60, 24
+; GFX906-NEXT:    v_writelane_b32 v63, s61, 25
+; GFX906-NEXT:    v_writelane_b32 v63, s62, 26
+; GFX906-NEXT:    v_writelane_b32 v63, s63, 27
+; GFX906-NEXT:    v_writelane_b32 v63, s64, 28
+; GFX906-NEXT:    v_writelane_b32 v63, s65, 29
+; GFX906-NEXT:    v_writelane_b32 v63, s66, 30
+; GFX906-NEXT:    v_writelane_b32 v63, s67, 31
+; GFX906-NEXT:    v_mov_b32_e32 v33, v30
+; GFX906-NEXT:    v_mov_b32_e32 v34, v29
+; GFX906-NEXT:    v_mov_b32_e32 v35, v28
+; GFX906-NEXT:    v_mov_b32_e32 v36, v27
+; GFX906-NEXT:    v_mov_b32_e32 v37, v26
+; GFX906-NEXT:    v_mov_b32_e32 v38, v25
+; GFX906-NEXT:    v_mov_b32_e32 v39, v24
+; GFX906-NEXT:    v_mov_b32_e32 v48, v23
+; GFX906-NEXT:    v_mov_b32_e32 v49, v22
+; GFX906-NEXT:    v_mov_b32_e32 v50, v21
+; GFX906-NEXT:    v_mov_b32_e32 v51, v20
+; GFX906-NEXT:    v_mov_b32_e32 v52, v19
+; GFX906-NEXT:    v_mov_b32_e32 v53, v18
+; GFX906-NEXT:    v_mov_b32_e32 v54, v17
+; GFX906-NEXT:    v_mov_b32_e32 v55, v16
+; GFX906-NEXT:    v_mov_b32_e32 v40, v15
+; GFX906-NEXT:    v_mov_b32_e32 v41, v14
+; GFX906-NEXT:    v_mov_b32_e32 v42, v13
+; GFX906-NEXT:    v_mov_b32_e32 v43, v12
+; GFX906-NEXT:    v_mov_b32_e32 v44, v11
+; GFX906-NEXT:    v_mov_b32_e32 v45, v10
+; GFX906-NEXT:    v_mov_b32_e32 v46, v9
+; GFX906-NEXT:    v_mov_b32_e32 v47, v8
+; GFX906-NEXT:    v_mov_b32_e32 v56, v7
+; GFX906-NEXT:    v_mov_b32_e32 v57, v6
+; GFX906-NEXT:    v_mov_b32_e32 v58, v5
+; GFX906-NEXT:    v_mov_b32_e32 v59, v4
+; GFX906-NEXT:    v_mov_b32_e32 v60, v3
+; GFX906-NEXT:    v_mov_b32_e32 v61, v2
+; GFX906-NEXT:    v_mov_b32_e32 v62, v1
+; GFX906-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr4
+; GFX906-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
+; GFX906-NEXT:    v_mov_b32_e32 v1, v62
+; GFX906-NEXT:    v_mov_b32_e32 v2, v61
+; GFX906-NEXT:    v_mov_b32_e32 v3, v60
+; GFX906-NEXT:    v_mov_b32_e32 v4, v59
+; GFX906-NEXT:    v_mov_b32_e32 v5, v58
+; GFX906-NEXT:    v_mov_b32_e32 v6, v57
+; GFX906-NEXT:    v_mov_b32_e32 v7, v56
+; GFX906-NEXT:    v_mov_b32_e32 v8, v47
+; GFX906-NEXT:    v_mov_b32_e32 v9, v46
+; GFX906-NEXT:    v_mov_b32_e32 v10, v45
+; GFX906-NEXT:    v_mov_b32_e32 v11, v44
+; GFX906-NEXT:    v_mov_b32_e32 v12, v43
+; GFX906-NEXT:    v_mov_b32_e32 v13, v42
+; GFX906-NEXT:    v_mov_b32_e32 v14, v41
+; GFX906-NEXT:    v_mov_b32_e32 v15, v40
+; GFX906-NEXT:    v_mov_b32_e32 v16, v55
+; GFX906-NEXT:    v_mov_b32_e32 v17, v54
+; GFX906-NEXT:    v_mov_b32_e32 v18, v53
+; GFX906-NEXT:    v_mov_b32_e32 v19, v52
+; GFX906-NEXT:    v_mov_b32_e32 v20, v51
+; GFX906-NEXT:    v_mov_b32_e32 v21, v50
+; GFX906-NEXT:    v_mov_b32_e32 v22, v49
+; GFX906-NEXT:    v_mov_b32_e32 v23, v48
+; GFX906-NEXT:    v_mov_b32_e32 v24, v39
+; GFX906-NEXT:    v_mov_b32_e32 v25, v38
+; GFX906-NEXT:    v_mov_b32_e32 v26, v37
+; GFX906-NEXT:    v_mov_b32_e32 v27, v36
+; GFX906-NEXT:    v_mov_b32_e32 v28, v35
+; GFX906-NEXT:    v_mov_b32_e32 v29, v34
+; GFX906-NEXT:    v_mov_b32_e32 v30, v33
+; GFX906-NEXT:    ; kill: def $vgpr31 killed $vgpr32 killed $exec
 ; GFX906-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    v_readlane_b32 s67, v31, 31
-; GFX906-NEXT:    v_readlane_b32 s66, v31, 30
-; GFX906-NEXT:    v_readlane_b32 s65, v31, 29
-; GFX906-NEXT:    v_readlane_b32 s64, v31, 28
-; GFX906-NEXT:    v_readlane_b32 s63, v31, 27
-; GFX906-NEXT:    v_readlane_b32 s62, v31, 26
-; GFX906-NEXT:    v_readlane_b32 s61, v31, 25
-; GFX906-NEXT:    v_readlane_b32 s60, v31, 24
-; GFX906-NEXT:    v_readlane_b32 s59, v31, 23
-; GFX906-NEXT:    v_readlane_b32 s58, v31, 22
-; GFX906-NEXT:    v_readlane_b32 s57, v31, 21
-; GFX906-NEXT:    v_readlane_b32 s56, v31, 20
-; GFX906-NEXT:    v_readlane_b32 s55, v31, 19
-; GFX906-NEXT:    v_readlane_b32 s54, v31, 18
-; GFX906-NEXT:    v_readlane_b32 s53, v31, 17
-; GFX906-NEXT:    v_readlane_b32 s52, v31, 16
-; GFX906-NEXT:    v_readlane_b32 s51, v31, 15
-; GFX906-NEXT:    v_readlane_b32 s50, v31, 14
-; GFX906-NEXT:    v_readlane_b32 s49, v31, 13
-; GFX906-NEXT:    v_readlane_b32 s48, v31, 12
-; GFX906-NEXT:    v_readlane_b32 s47, v31, 11
-; GFX906-NEXT:    v_readlane_b32 s46, v31, 10
-; GFX906-NEXT:    v_readlane_b32 s45, v31, 9
-; GFX906-NEXT:    v_readlane_b32 s44, v31, 8
-; GFX906-NEXT:    v_readlane_b32 s43, v31, 7
-; GFX906-NEXT:    v_readlane_b32 s42, v31, 6
-; GFX906-NEXT:    v_readlane_b32 s41, v31, 5
-; GFX906-NEXT:    v_readlane_b32 s40, v31, 4
-; GFX906-NEXT:    v_readlane_b32 s39, v31, 3
-; GFX906-NEXT:    v_readlane_b32 s38, v31, 2
-; GFX906-NEXT:    v_readlane_b32 s37, v31, 1
-; GFX906-NEXT:    v_readlane_b32 s36, v31, 0
-; GFX906-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX906-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_readlane_b32 s67, v63, 31
+; GFX906-NEXT:    v_readlane_b32 s66, v63, 30
+; GFX906-NEXT:    v_readlane_b32 s65, v63, 29
+; GFX906-NEXT:    v_readlane_b32 s64, v63, 28
+; GFX906-NEXT:    v_readlane_b32 s63, v63, 27
+; GFX906-NEXT:    v_readlane_b32 s62, v63, 26
+; GFX906-NEXT:    v_readlane_b32 s61, v63, 25
+; GFX906-NEXT:    v_readlane_b32 s60, v63, 24
+; GFX906-NEXT:    v_readlane_b32 s59, v63, 23
+; GFX906-NEXT:    v_readlane_b32 s58, v63, 22
+; GFX906-NEXT:    v_readlane_b32 s57, v63, 21
+; GFX906-NEXT:    v_readlane_b32 s56, v63, 20
+; GFX906-NEXT:    v_readlane_b32 s55, v63, 19
+; GFX906-NEXT:    v_readlane_b32 s54, v63, 18
+; GFX906-NEXT:    v_readlane_b32 s53, v63, 17
+; GFX906-NEXT:    v_readlane_b32 s52, v63, 16
+; GFX906-NEXT:    v_readlane_b32 s51, v63, 15
+; GFX906-NEXT:    v_readlane_b32 s50, v63, 14
+; GFX906-NEXT:    v_readlane_b32 s49, v63, 13
+; GFX906-NEXT:    v_readlane_b32 s48, v63, 12
+; GFX906-NEXT:    v_readlane_b32 s47, v63, 11
+; GFX906-NEXT:    v_readlane_b32 s46, v63, 10
+; GFX906-NEXT:    v_readlane_b32 s45, v63, 9
+; GFX906-NEXT:    v_readlane_b32 s44, v63, 8
+; GFX906-NEXT:    v_readlane_b32 s43, v63, 7
+; GFX906-NEXT:    v_readlane_b32 s42, v63, 6
+; GFX906-NEXT:    v_readlane_b32 s41, v63, 5
+; GFX906-NEXT:    v_readlane_b32 s40, v63, 4
+; GFX906-NEXT:    v_readlane_b32 s39, v63, 3
+; GFX906-NEXT:    v_readlane_b32 s38, v63, 2
+; GFX906-NEXT:    v_readlane_b32 s37, v63, 1
+; GFX906-NEXT:    v_readlane_b32 s36, v63, 0
+; GFX906-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX906-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX908-LABEL: test_tuple:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX908-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX908-NEXT:    v_writelane_b32 v62, s36, 0
+; GFX908-NEXT:    v_writelane_b32 v62, s37, 1
+; GFX908-NEXT:    v_writelane_b32 v62, s38, 2
+; GFX908-NEXT:    v_writelane_b32 v62, s39, 3
+; GFX908-NEXT:    v_writelane_b32 v62, s40, 4
+; GFX908-NEXT:    v_writelane_b32 v62, s41, 5
+; GFX908-NEXT:    v_writelane_b32 v62, s42, 6
+; GFX908-NEXT:    v_writelane_b32 v62, s43, 7
+; GFX908-NEXT:    v_writelane_b32 v62, s44, 8
+; GFX908-NEXT:    v_writelane_b32 v62, s45, 9
+; GFX908-NEXT:    v_writelane_b32 v62, s46, 10
+; GFX908-NEXT:    v_writelane_b32 v62, s47, 11
+; GFX908-NEXT:    v_writelane_b32 v62, s48, 12
+; GFX908-NEXT:    v_writelane_b32 v62, s49, 13
+; GFX908-NEXT:    v_writelane_b32 v62, s50, 14
+; GFX908-NEXT:    v_writelane_b32 v62, s51, 15
+; GFX908-NEXT:    v_writelane_b32 v62, s52, 16
+; GFX908-NEXT:    v_writelane_b32 v62, s53, 17
+; GFX908-NEXT:    v_writelane_b32 v62, s54, 18
+; GFX908-NEXT:    v_writelane_b32 v62, s55, 19
+; GFX908-NEXT:    v_writelane_b32 v62, s56, 20
+; GFX908-NEXT:    v_writelane_b32 v62, s57, 21
+; GFX908-NEXT:    v_writelane_b32 v62, s58, 22
+; GFX908-NEXT:    v_writelane_b32 v62, s59, 23
+; GFX908-NEXT:    v_writelane_b32 v62, s60, 24
+; GFX908-NEXT:    v_writelane_b32 v62, s61, 25
+; GFX908-NEXT:    v_writelane_b32 v62, s62, 26
+; GFX908-NEXT:    v_writelane_b32 v62, s63, 27
+; GFX908-NEXT:    v_writelane_b32 v62, s64, 28
+; GFX908-NEXT:    v_writelane_b32 v62, s65, 29
+; GFX908-NEXT:    v_writelane_b32 v62, s66, 30
+; GFX908-NEXT:    v_writelane_b32 v62, s67, 31
+; GFX908-NEXT:    v_mov_b32_e32 v33, v30
+; GFX908-NEXT:    v_mov_b32_e32 v34, v29
+; GFX908-NEXT:    v_mov_b32_e32 v35, v28
+; GFX908-NEXT:    v_mov_b32_e32 v36, v27
+; GFX908-NEXT:    v_mov_b32_e32 v37, v26
+; GFX908-NEXT:    v_mov_b32_e32 v38, v25
+; GFX908-NEXT:    v_mov_b32_e32 v39, v24
+; GFX908-NEXT:    v_mov_b32_e32 v48, v23
+; GFX908-NEXT:    v_mov_b32_e32 v49, v22
+; GFX908-NEXT:    v_mov_b32_e32 v50, v21
+; GFX908-NEXT:    v_mov_b32_e32 v51, v20
+; GFX908-NEXT:    v_mov_b32_e32 v52, v19
+; GFX908-NEXT:    v_mov_b32_e32 v53, v18
+; GFX908-NEXT:    v_mov_b32_e32 v54, v17
+; GFX908-NEXT:    v_mov_b32_e32 v55, v16
+; GFX908-NEXT:    v_mov_b32_e32 v40, v15
+; GFX908-NEXT:    v_mov_b32_e32 v41, v14
+; GFX908-NEXT:    v_mov_b32_e32 v42, v13
+; GFX908-NEXT:    v_mov_b32_e32 v43, v12
+; GFX908-NEXT:    v_mov_b32_e32 v44, v11
+; GFX908-NEXT:    v_mov_b32_e32 v45, v10
+; GFX908-NEXT:    v_mov_b32_e32 v46, v9
+; GFX908-NEXT:    v_mov_b32_e32 v47, v8
+; GFX908-NEXT:    v_mov_b32_e32 v56, v7
+; GFX908-NEXT:    v_mov_b32_e32 v57, v6
+; GFX908-NEXT:    v_mov_b32_e32 v58, v5
+; GFX908-NEXT:    v_mov_b32_e32 v59, v4
+; GFX908-NEXT:    v_mov_b32_e32 v60, v3
+; GFX908-NEXT:    v_mov_b32_e32 v61, v2
+; GFX908-NEXT:    v_mov_b32_e32 v32, v1
+; GFX908-NEXT:    buffer_load_dword v1, off, s[0:3], s32
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_accvgpr_write_b32 a14, v1 ; Reload Reuse
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; implicit-def: $sgpr4
+; GFX908-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
+; GFX908-NEXT:    v_mov_b32_e32 v1, v32
+; GFX908-NEXT:    s_nop 1
+; GFX908-NEXT:    v_accvgpr_read_b32 v32, a14 ; Reload Reuse
+; GFX908-NEXT:    v_mov_b32_e32 v2, v61
+; GFX908-NEXT:    v_mov_b32_e32 v3, v60
+; GFX908-NEXT:    v_mov_b32_e32 v4, v59
+; GFX908-NEXT:    v_mov_b32_e32 v5, v58
+; GFX908-NEXT:    v_mov_b32_e32 v6, v57
+; GFX908-NEXT:    v_mov_b32_e32 v7, v56
+; GFX908-NEXT:    v_mov_b32_e32 v8, v47
+; GFX908-NEXT:    v_mov_b32_e32 v9, v46
+; GFX908-NEXT:    v_mov_b32_e32 v10, v45
+; GFX908-NEXT:    v_mov_b32_e32 v11, v44
+; GFX908-NEXT:    v_mov_b32_e32 v12, v43
+; GFX908-NEXT:    v_mov_b32_e32 v13, v42
+; GFX908-NEXT:    v_mov_b32_e32 v14, v41
+; GFX908-NEXT:    v_mov_b32_e32 v15, v40
+; GFX908-NEXT:    v_mov_b32_e32 v16, v55
+; GFX908-NEXT:    v_mov_b32_e32 v17, v54
+; GFX908-NEXT:    v_mov_b32_e32 v18, v53
+; GFX908-NEXT:    v_mov_b32_e32 v19, v52
+; GFX908-NEXT:    v_mov_b32_e32 v20, v51
+; GFX908-NEXT:    v_mov_b32_e32 v21, v50
+; GFX908-NEXT:    v_mov_b32_e32 v22, v49
+; GFX908-NEXT:    v_mov_b32_e32 v23, v48
+; GFX908-NEXT:    v_mov_b32_e32 v24, v39
+; GFX908-NEXT:    v_mov_b32_e32 v25, v38
+; GFX908-NEXT:    v_mov_b32_e32 v26, v37
+; GFX908-NEXT:    v_mov_b32_e32 v27, v36
+; GFX908-NEXT:    v_mov_b32_e32 v28, v35
+; GFX908-NEXT:    v_mov_b32_e32 v29, v34
+; GFX908-NEXT:    v_mov_b32_e32 v30, v33
+; GFX908-NEXT:    ; kill: def $vgpr31 killed $vgpr32 killed $exec
+; GFX908-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    v_readlane_b32 s67, v62, 31
+; GFX908-NEXT:    v_readlane_b32 s66, v62, 30
+; GFX908-NEXT:    v_readlane_b32 s65, v62, 29
+; GFX908-NEXT:    v_readlane_b32 s64, v62, 28
+; GFX908-NEXT:    v_readlane_b32 s63, v62, 27
+; GFX908-NEXT:    v_readlane_b32 s62, v62, 26
+; GFX908-NEXT:    v_readlane_b32 s61, v62, 25
+; GFX908-NEXT:    v_readlane_b32 s60, v62, 24
+; GFX908-NEXT:    v_readlane_b32 s59, v62, 23
+; GFX908-NEXT:    v_readlane_b32 s58, v62, 22
+; GFX908-NEXT:    v_readlane_b32 s57, v62, 21
+; GFX908-NEXT:    v_readlane_b32 s56, v62, 20
+; GFX908-NEXT:    v_readlane_b32 s55, v62, 19
+; GFX908-NEXT:    v_readlane_b32 s54, v62, 18
+; GFX908-NEXT:    v_readlane_b32 s53, v62, 17
+; GFX908-NEXT:    v_readlane_b32 s52, v62, 16
+; GFX908-NEXT:    v_readlane_b32 s51, v62, 15
+; GFX908-NEXT:    v_readlane_b32 s50, v62, 14
+; GFX908-NEXT:    v_readlane_b32 s49, v62, 13
+; GFX908-NEXT:    v_readlane_b32 s48, v62, 12
+; GFX908-NEXT:    v_readlane_b32 s47, v62, 11
+; GFX908-NEXT:    v_readlane_b32 s46, v62, 10
+; GFX908-NEXT:    v_readlane_b32 s45, v62, 9
+; GFX908-NEXT:    v_readlane_b32 s44, v62, 8
+; GFX908-NEXT:    v_readlane_b32 s43, v62, 7
+; GFX908-NEXT:    v_readlane_b32 s42, v62, 6
+; GFX908-NEXT:    v_readlane_b32 s41, v62, 5
+; GFX908-NEXT:    v_readlane_b32 s40, v62, 4
+; GFX908-NEXT:    v_readlane_b32 s39, v62, 3
+; GFX908-NEXT:    v_readlane_b32 s38, v62, 2
+; GFX908-NEXT:    v_readlane_b32 s37, v62, 1
+; GFX908-NEXT:    v_readlane_b32 s36, v62, 0
+; GFX908-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX908-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX908-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX908-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX90a-LABEL: test_tuple:
 ; GFX90a:       ; %bb.0:
 ; GFX90a-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90a-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX90a-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90a-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX90a-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX90a-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX90a-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
 ; GFX90a-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
@@ -385,41 +642,69 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX90a-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
 ; GFX90a-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
 ; GFX90a-NEXT:    v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX90a-NEXT:    v_accvgpr_write_b32 a15, v63 ; Reload Reuse
-; GFX90a-NEXT:    v_writelane_b32 v31, s36, 0
-; GFX90a-NEXT:    v_writelane_b32 v31, s37, 1
-; GFX90a-NEXT:    v_writelane_b32 v31, s38, 2
-; GFX90a-NEXT:    v_writelane_b32 v31, s39, 3
-; GFX90a-NEXT:    v_writelane_b32 v31, s40, 4
-; GFX90a-NEXT:    v_writelane_b32 v31, s41, 5
-; GFX90a-NEXT:    v_writelane_b32 v31, s42, 6
-; GFX90a-NEXT:    v_writelane_b32 v31, s43, 7
-; GFX90a-NEXT:    v_writelane_b32 v31, s44, 8
-; GFX90a-NEXT:    v_writelane_b32 v31, s45, 9
-; GFX90a-NEXT:    v_writelane_b32 v31, s46, 10
-; GFX90a-NEXT:    v_writelane_b32 v31, s47, 11
-; GFX90a-NEXT:    v_writelane_b32 v31, s48, 12
-; GFX90a-NEXT:    v_writelane_b32 v31, s49, 13
-; GFX90a-NEXT:    v_writelane_b32 v31, s50, 14
-; GFX90a-NEXT:    v_writelane_b32 v31, s51, 15
-; GFX90a-NEXT:    v_writelane_b32 v31, s52, 16
-; GFX90a-NEXT:    v_writelane_b32 v31, s53, 17
-; GFX90a-NEXT:    v_writelane_b32 v31, s54, 18
-; GFX90a-NEXT:    v_writelane_b32 v31, s55, 19
-; GFX90a-NEXT:    v_writelane_b32 v31, s56, 20
-; GFX90a-NEXT:    v_writelane_b32 v31, s57, 21
-; GFX90a-NEXT:    v_writelane_b32 v31, s58, 22
-; GFX90a-NEXT:    v_writelane_b32 v31, s59, 23
-; GFX90a-NEXT:    v_writelane_b32 v31, s60, 24
-; GFX90a-NEXT:    v_writelane_b32 v31, s61, 25
-; GFX90a-NEXT:    v_writelane_b32 v31, s62, 26
-; GFX90a-NEXT:    v_writelane_b32 v31, s63, 27
-; GFX90a-NEXT:    v_writelane_b32 v31, s64, 28
-; GFX90a-NEXT:    v_writelane_b32 v31, s65, 29
-; GFX90a-NEXT:    v_writelane_b32 v31, s66, 30
-; GFX90a-NEXT:    v_writelane_b32 v31, s67, 31
-; GFX90a-NEXT:    v_mov_b32_e32 v32, v0
-; GFX90a-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX90a-NEXT:    v_writelane_b32 v63, s36, 0
+; GFX90a-NEXT:    v_writelane_b32 v63, s37, 1
+; GFX90a-NEXT:    v_writelane_b32 v63, s38, 2
+; GFX90a-NEXT:    v_writelane_b32 v63, s39, 3
+; GFX90a-NEXT:    v_writelane_b32 v63, s40, 4
+; GFX90a-NEXT:    v_writelane_b32 v63, s41, 5
+; GFX90a-NEXT:    v_writelane_b32 v63, s42, 6
+; GFX90a-NEXT:    v_writelane_b32 v63, s43, 7
+; GFX90a-NEXT:    v_writelane_b32 v63, s44, 8
+; GFX90a-NEXT:    v_writelane_b32 v63, s45, 9
+; GFX90a-NEXT:    v_writelane_b32 v63, s46, 10
+; GFX90a-NEXT:    v_writelane_b32 v63, s47, 11
+; GFX90a-NEXT:    v_writelane_b32 v63, s48, 12
+; GFX90a-NEXT:    v_writelane_b32 v63, s49, 13
+; GFX90a-NEXT:    v_writelane_b32 v63, s50, 14
+; GFX90a-NEXT:    v_writelane_b32 v63, s51, 15
+; GFX90a-NEXT:    v_writelane_b32 v63, s52, 16
+; GFX90a-NEXT:    v_writelane_b32 v63, s53, 17
+; GFX90a-NEXT:    v_writelane_b32 v63, s54, 18
+; GFX90a-NEXT:    v_writelane_b32 v63, s55, 19
+; GFX90a-NEXT:    v_writelane_b32 v63, s56, 20
+; GFX90a-NEXT:    v_writelane_b32 v63, s57, 21
+; GFX90a-NEXT:    v_writelane_b32 v63, s58, 22
+; GFX90a-NEXT:    v_writelane_b32 v63, s59, 23
+; GFX90a-NEXT:    v_writelane_b32 v63, s60, 24
+; GFX90a-NEXT:    v_writelane_b32 v63, s61, 25
+; GFX90a-NEXT:    v_writelane_b32 v63, s62, 26
+; GFX90a-NEXT:    v_writelane_b32 v63, s63, 27
+; GFX90a-NEXT:    v_writelane_b32 v63, s64, 28
+; GFX90a-NEXT:    v_writelane_b32 v63, s65, 29
+; GFX90a-NEXT:    v_writelane_b32 v63, s66, 30
+; GFX90a-NEXT:    v_writelane_b32 v63, s67, 31
+; GFX90a-NEXT:    v_mov_b32_e32 v33, v30
+; GFX90a-NEXT:    v_mov_b32_e32 v34, v29
+; GFX90a-NEXT:    v_mov_b32_e32 v35, v28
+; GFX90a-NEXT:    v_mov_b32_e32 v36, v27
+; GFX90a-NEXT:    v_mov_b32_e32 v37, v26
+; GFX90a-NEXT:    v_mov_b32_e32 v38, v25
+; GFX90a-NEXT:    v_mov_b32_e32 v39, v24
+; GFX90a-NEXT:    v_mov_b32_e32 v48, v23
+; GFX90a-NEXT:    v_mov_b32_e32 v49, v22
+; GFX90a-NEXT:    v_mov_b32_e32 v50, v21
+; GFX90a-NEXT:    v_mov_b32_e32 v51, v20
+; GFX90a-NEXT:    v_mov_b32_e32 v52, v19
+; GFX90a-NEXT:    v_mov_b32_e32 v53, v18
+; GFX90a-NEXT:    v_mov_b32_e32 v54, v17
+; GFX90a-NEXT:    v_mov_b32_e32 v55, v16
+; GFX90a-NEXT:    v_mov_b32_e32 v40, v15
+; GFX90a-NEXT:    v_mov_b32_e32 v41, v14
+; GFX90a-NEXT:    v_mov_b32_e32 v42, v13
+; GFX90a-NEXT:    v_mov_b32_e32 v43, v12
+; GFX90a-NEXT:    v_mov_b32_e32 v44, v11
+; GFX90a-NEXT:    v_mov_b32_e32 v45, v10
+; GFX90a-NEXT:    v_mov_b32_e32 v46, v9
+; GFX90a-NEXT:    v_mov_b32_e32 v47, v8
+; GFX90a-NEXT:    v_mov_b32_e32 v56, v7
+; GFX90a-NEXT:    v_mov_b32_e32 v57, v6
+; GFX90a-NEXT:    v_mov_b32_e32 v58, v5
+; GFX90a-NEXT:    v_mov_b32_e32 v59, v4
+; GFX90a-NEXT:    v_mov_b32_e32 v60, v3
+; GFX90a-NEXT:    v_mov_b32_e32 v61, v2
+; GFX90a-NEXT:    v_mov_b32_e32 v62, v1
+; GFX90a-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX90a-NEXT:    ; implicit-def: $sgpr4
 ; GFX90a-NEXT:    ; implicit-def: $sgpr4
 ; GFX90a-NEXT:    ; implicit-def: $sgpr4
@@ -452,74 +737,72 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX90a-NEXT:    ; implicit-def: $sgpr4
 ; GFX90a-NEXT:    ; implicit-def: $sgpr4
 ; GFX90a-NEXT:    ; implicit-def: $sgpr4
-; GFX90a-NEXT:    ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec
-; GFX90a-NEXT:    v_mov_b32_e32 v33, v1
-; GFX90a-NEXT:    v_mov_b32_e32 v34, v2
-; GFX90a-NEXT:    v_mov_b32_e32 v35, v3
-; GFX90a-NEXT:    v_mov_b32_e32 v36, v4
-; GFX90a-NEXT:    v_mov_b32_e32 v37, v5
-; GFX90a-NEXT:    v_mov_b32_e32 v38, v6
-; GFX90a-NEXT:    v_mov_b32_e32 v39, v7
-; GFX90a-NEXT:    v_mov_b32_e32 v40, v8
-; GFX90a-NEXT:    v_mov_b32_e32 v41, v9
-; GFX90a-NEXT:    v_mov_b32_e32 v42, v10
-; GFX90a-NEXT:    v_mov_b32_e32 v43, v11
-; GFX90a-NEXT:    v_mov_b32_e32 v44, v12
-; GFX90a-NEXT:    v_mov_b32_e32 v45, v13
-; GFX90a-NEXT:    v_mov_b32_e32 v46, v14
-; GFX90a-NEXT:    v_mov_b32_e32 v47, v15
-; GFX90a-NEXT:    v_mov_b32_e32 v48, v16
-; GFX90a-NEXT:    v_mov_b32_e32 v49, v17
-; GFX90a-NEXT:    v_mov_b32_e32 v50, v18
-; GFX90a-NEXT:    v_mov_b32_e32 v51, v19
-; GFX90a-NEXT:    v_mov_b32_e32 v52, v20
-; GFX90a-NEXT:    v_mov_b32_e32 v53, v21
-; GFX90a-NEXT:    v_mov_b32_e32 v54, v22
-; GFX90a-NEXT:    v_mov_b32_e32 v55, v23
-; GFX90a-NEXT:    v_mov_b32_e32 v56, v24
-; GFX90a-NEXT:    v_mov_b32_e32 v57, v25
-; GFX90a-NEXT:    v_mov_b32_e32 v58, v26
-; GFX90a-NEXT:    v_mov_b32_e32 v59, v27
-; GFX90a-NEXT:    v_mov_b32_e32 v60, v28
-; GFX90a-NEXT:    v_mov_b32_e32 v61, v29
-; GFX90a-NEXT:    v_mov_b32_e32 v62, v30
-; GFX90a-NEXT:    ; kill: def $vgpr63 killed $vgpr0 killed $exec
+; GFX90a-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
+; GFX90a-NEXT:    v_mov_b32_e32 v1, v62
+; GFX90a-NEXT:    v_mov_b32_e32 v2, v61
+; GFX90a-NEXT:    v_mov_b32_e32 v3, v60
+; GFX90a-NEXT:    v_mov_b32_e32 v4, v59
+; GFX90a-NEXT:    v_mov_b32_e32 v5, v58
+; GFX90a-NEXT:    v_mov_b32_e32 v6, v57
+; GFX90a-NEXT:    v_mov_b32_e32 v7, v56
+; GFX90a-NEXT:    v_mov_b32_e32 v8, v47
+; GFX90a-NEXT:    v_mov_b32_e32 v9, v46
+; GFX90a-NEXT:    v_mov_b32_e32 v10, v45
+; GFX90a-NEXT:    v_mov_b32_e32 v11, v44
+; GFX90a-NEXT:    v_mov_b32_e32 v12, v43
+; GFX90a-NEXT:    v_mov_b32_e32 v13, v42
+; GFX90a-NEXT:    v_mov_b32_e32 v14, v41
+; GFX90a-NEXT:    v_mov_b32_e32 v15, v40
+; GFX90a-NEXT:    v_mov_b32_e32 v16, v55
+; GFX90a-NEXT:    v_mov_b32_e32 v17, v54
+; GFX90a-NEXT:    v_mov_b32_e32 v18, v53
+; GFX90a-NEXT:    v_mov_b32_e32 v19, v52
+; GFX90a-NEXT:    v_mov_b32_e32 v20, v51
+; GFX90a-NEXT:    v_mov_b32_e32 v21, v50
+; GFX90a-NEXT:    v_mov_b32_e32 v22, v49
+; GFX90a-NEXT:    v_mov_b32_e32 v23, v48
+; GFX90a-NEXT:    v_mov_b32_e32 v24, v39
+; GFX90a-NEXT:    v_mov_b32_e32 v25, v38
+; GFX90a-NEXT:    v_mov_b32_e32 v26, v37
+; GFX90a-NEXT:    v_mov_b32_e32 v27, v36
+; GFX90a-NEXT:    v_mov_b32_e32 v28, v35
+; GFX90a-NEXT:    v_mov_b32_e32 v29, v34
+; GFX90a-NEXT:    v_mov_b32_e32 v30, v33
+; GFX90a-NEXT:    ; kill: def $vgpr31 killed $vgpr32 killed $exec
 ; GFX90a-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX90a-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_readlane_b32 s67, v31, 31
-; GFX90a-NEXT:    v_readlane_b32 s66, v31, 30
-; GFX90a-NEXT:    v_readlane_b32 s65, v31, 29
-; GFX90a-NEXT:    v_readlane_b32 s64, v31, 28
-; GFX90a-NEXT:    v_readlane_b32 s63, v31, 27
-; GFX90a-NEXT:    v_readlane_b32 s62, v31, 26
-; GFX90a-NEXT:    v_readlane_b32 s61, v31, 25
-; GFX90a-NEXT:    v_readlane_b32 s60, v31, 24
-; GFX90a-NEXT:    v_readlane_b32 s59, v31, 23
-; GFX90a-NEXT:    v_readlane_b32 s58, v31, 22
-; GFX90a-NEXT:    v_readlane_b32 s57, v31, 21
-; GFX90a-NEXT:    v_readlane_b32 s56, v31, 20
-; GFX90a-NEXT:    v_readlane_b32 s55, v31, 19
-; GFX90a-NEXT:    v_readlane_b32 s54, v31, 18
-; GFX90a-NEXT:    v_readlane_b32 s53, v31, 17
-; GFX90a-NEXT:    v_readlane_b32 s52, v31, 16
-; GFX90a-NEXT:    v_readlane_b32 s51, v31, 15
-; GFX90a-NEXT:    v_readlane_b32 s50, v31, 14
-; GFX90a-NEXT:    v_readlane_b32 s49, v31, 13
-; GFX90a-NEXT:    v_readlane_b32 s48, v31, 12
-; GFX90a-NEXT:    v_readlane_b32 s47, v31, 11
-; GFX90a-NEXT:    v_readlane_b32 s46, v31, 10
-; GFX90a-NEXT:    v_readlane_b32 s45, v31, 9
-; GFX90a-NEXT:    v_readlane_b32 s44, v31, 8
-; GFX90a-NEXT:    v_readlane_b32 s43, v31, 7
-; GFX90a-NEXT:    v_readlane_b32 s42, v31, 6
-; GFX90a-NEXT:    v_readlane_b32 s41, v31, 5
-; GFX90a-NEXT:    v_readlane_b32 s40, v31, 4
-; GFX90a-NEXT:    v_readlane_b32 s39, v31, 3
-; GFX90a-NEXT:    v_readlane_b32 s38, v31, 2
-; GFX90a-NEXT:    v_readlane_b32 s37, v31, 1
-; GFX90a-NEXT:    v_readlane_b32 s36, v31, 0
-; GFX90a-NEXT:    v_accvgpr_read_b32 v63, a15 ; Reload Reuse
+; GFX90a-NEXT:    v_readlane_b32 s67, v63, 31
+; GFX90a-NEXT:    v_readlane_b32 s66, v63, 30
+; GFX90a-NEXT:    v_readlane_b32 s65, v63, 29
+; GFX90a-NEXT:    v_readlane_b32 s64, v63, 28
+; GFX90a-NEXT:    v_readlane_b32 s63, v63, 27
+; GFX90a-NEXT:    v_readlane_b32 s62, v63, 26
+; GFX90a-NEXT:    v_readlane_b32 s61, v63, 25
+; GFX90a-NEXT:    v_readlane_b32 s60, v63, 24
+; GFX90a-NEXT:    v_readlane_b32 s59, v63, 23
+; GFX90a-NEXT:    v_readlane_b32 s58, v63, 22
+; GFX90a-NEXT:    v_readlane_b32 s57, v63, 21
+; GFX90a-NEXT:    v_readlane_b32 s56, v63, 20
+; GFX90a-NEXT:    v_readlane_b32 s55, v63, 19
+; GFX90a-NEXT:    v_readlane_b32 s54, v63, 18
+; GFX90a-NEXT:    v_readlane_b32 s53, v63, 17
+; GFX90a-NEXT:    v_readlane_b32 s52, v63, 16
+; GFX90a-NEXT:    v_readlane_b32 s51, v63, 15
+; GFX90a-NEXT:    v_readlane_b32 s50, v63, 14
+; GFX90a-NEXT:    v_readlane_b32 s49, v63, 13
+; GFX90a-NEXT:    v_readlane_b32 s48, v63, 12
+; GFX90a-NEXT:    v_readlane_b32 s47, v63, 11
+; GFX90a-NEXT:    v_readlane_b32 s46, v63, 10
+; GFX90a-NEXT:    v_readlane_b32 s45, v63, 9
+; GFX90a-NEXT:    v_readlane_b32 s44, v63, 8
+; GFX90a-NEXT:    v_readlane_b32 s43, v63, 7
+; GFX90a-NEXT:    v_readlane_b32 s42, v63, 6
+; GFX90a-NEXT:    v_readlane_b32 s41, v63, 5
+; GFX90a-NEXT:    v_readlane_b32 s40, v63, 4
+; GFX90a-NEXT:    v_readlane_b32 s39, v63, 3
+; GFX90a-NEXT:    v_readlane_b32 s38, v63, 2
+; GFX90a-NEXT:    v_readlane_b32 s37, v63, 1
+; GFX90a-NEXT:    v_readlane_b32 s36, v63, 0
 ; GFX90a-NEXT:    v_accvgpr_read_b32 v62, a14 ; Reload Reuse
 ; GFX90a-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
 ; GFX90a-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
@@ -535,8 +818,8 @@ define i32 @test_tuple(<16 x i64> %0) {
 ; GFX90a-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
 ; GFX90a-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
 ; GFX90a-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
-; GFX90a-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX90a-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90a-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX90a-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX90a-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX90a-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 92efbe5a718263..2b96e10fd3cc3b 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -14,17 +14,17 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v36, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v35, v15
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v14
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v32, v12
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    ;;#ASMSTART
@@ -33,30 +33,30 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT:    image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX9-NEXT:    v_writelane_b32 v44, s4, 2
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v44, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v44, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v41
-; GFX9-NEXT:    v_mov_b32_e32 v1, v42
-; GFX9-NEXT:    v_mov_b32_e32 v2, v43
-; GFX9-NEXT:    v_mov_b32_e32 v3, v44
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_mov_b32_e32 v0, v40
+; GFX9-NEXT:    v_mov_b32_e32 v1, v41
+; GFX9-NEXT:    v_mov_b32_e32 v2, v42
+; GFX9-NEXT:    v_mov_b32_e32 v3, v43
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v44, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s4
@@ -69,7 +69,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    s_mov_b32 s4, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v36, v16
@@ -77,10 +77,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    v_mov_b32_e32 v34, v14
 ; GFX10-NEXT:    v_mov_b32_e32 v33, v13
 ; GFX10-NEXT:    v_mov_b32_e32 v32, v12
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    ;;#ASMSTART
@@ -89,31 +89,31 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
-; GFX10-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX10-NEXT:    v_writelane_b32 v44, s4, 2
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_writelane_b32 v44, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v44, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v41
-; GFX10-NEXT:    v_mov_b32_e32 v1, v42
-; GFX10-NEXT:    v_mov_b32_e32 v2, v43
-; GFX10-NEXT:    v_mov_b32_e32 v3, v44
+; GFX10-NEXT:    v_mov_b32_e32 v0, v40
+; GFX10-NEXT:    v_mov_b32_e32 v1, v41
+; GFX10-NEXT:    v_mov_b32_e32 v2, v42
+; GFX10-NEXT:    v_mov_b32_e32 v3, v43
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
+; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX10-NEXT:    v_readlane_b32 s4, v44, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
@@ -127,16 +127,16 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
 ; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
 ; GFX11-NEXT:    v_mov_b32_e32 v32, v12
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
-; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v44, s33
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:12
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -145,29 +145,29 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    image_gather4_c_b_cl v[40:43], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v44, s0, 2
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v44, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v44, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42
-; GFX11-NEXT:    v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44
+; GFX11-NEXT:    v_dual_mov_b32 v0, v40 :: v_dual_mov_b32 v1, v41
+; GFX11-NEXT:    v_dual_mov_b32 v2, v42 :: v_dual_mov_b32 v3, v43
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v44, off, s33
-; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:4
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
+; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0
@@ -204,43 +204,43 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_mov_b32_e32 v45, v16
-; GFX9-NEXT:    v_mov_b32_e32 v44, v15
-; GFX9-NEXT:    v_mov_b32_e32 v43, v14
-; GFX9-NEXT:    v_mov_b32_e32 v42, v13
-; GFX9-NEXT:    v_mov_b32_e32 v41, v12
-; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v44, v16
+; GFX9-NEXT:    v_mov_b32_e32 v43, v15
+; GFX9-NEXT:    v_mov_b32_e32 v42, v14
+; GFX9-NEXT:    v_mov_b32_e32 v41, v13
+; GFX9-NEXT:    v_mov_b32_e32 v40, v12
+; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX9-NEXT:    v_writelane_b32 v45, s4, 2
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v45, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v45, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v45, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v45, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-NEXT:    s_mov_b32 s33, s4
@@ -253,44 +253,44 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    s_mov_b32 s4, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s5
-; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX10-NEXT:    v_writelane_b32 v45, s4, 2
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT:    v_mov_b32_e32 v41, v16
+; GFX10-NEXT:    v_mov_b32_e32 v40, v16
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v42, v15
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_mov_b32_e32 v43, v14
-; GFX10-NEXT:    v_mov_b32_e32 v44, v13
-; GFX10-NEXT:    v_mov_b32_e32 v45, v12
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_mov_b32_e32 v41, v15
+; GFX10-NEXT:    v_writelane_b32 v45, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v42, v14
+; GFX10-NEXT:    v_mov_b32_e32 v43, v13
+; GFX10-NEXT:    v_mov_b32_e32 v44, v12
+; GFX10-NEXT:    v_writelane_b32 v45, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_clause 0x4
-; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16
+; GFX10-NEXT:    v_readlane_b32 s31, v45, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX10-NEXT:    v_readlane_b32 s4, v45, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
@@ -304,42 +304,42 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:20 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:20 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_clause 0x4
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:12
-; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:8
-; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v45, s33
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33
 ; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_writelane_b32 v45, s0, 2
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
-; GFX11-NEXT:    v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
+; GFX11-NEXT:    v_dual_mov_b32 v40, v16 :: v_dual_mov_b32 v41, v15
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13
-; GFX11-NEXT:    v_mov_b32_e32 v45, v12
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v45, s30, 0
+; GFX11-NEXT:    v_dual_mov_b32 v42, v14 :: v_dual_mov_b32 v43, v13
+; GFX11-NEXT:    v_mov_b32_e32 v44, v12
+; GFX11-NEXT:    v_writelane_b32 v45, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_clause 0x4
-; GFX11-NEXT:    scratch_load_b32 v45, off, s33
-; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:4
-; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:8
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:12
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:16
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16
+; GFX11-NEXT:    v_readlane_b32 s31, v45, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v45, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v45, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:20 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:20 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT:    s_mov_b32 s33, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index ab7d3ca0ab425f..e79cb66dcd7760 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -371,13 +371,13 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b32 s48, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x400
-; GFX9-O0-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s40, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
@@ -393,9 +393,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s43
 ; GFX9-O0-NEXT:    s_mov_b32 s39, s42
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-O0-NEXT:    s_getpc_b64 s[42:43]
@@ -405,18 +405,18 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b64 s[44:45], s[0:1]
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[44:45]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[46:47]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v3
+; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffffc00
@@ -430,33 +430,33 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    s_mov_b32 s38, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-O3-NEXT:    v_writelane_b32 v3, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-O3-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-O3-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
 ; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4
 ; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called@rel32@hi+12
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xfc00
@@ -562,12 +562,12 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b32 s48, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -581,8 +581,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x1000
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s31, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s4
@@ -606,11 +606,11 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s9
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s37
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s37
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    v_writelane_b32 v0, s34, 4
@@ -618,10 +618,10 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s34, v[9:10]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s34, v[8:9]
 ; GFX9-O0-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-O0-NEXT:    s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
@@ -651,8 +651,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
 ; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
 ; GFX9-O0-NEXT:    v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
@@ -660,15 +660,15 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v8, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s30, v8, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX9-O0-NEXT:    ; kill: killed $vgpr0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
@@ -689,50 +689,50 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    s_mov_b32 s40, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_writelane_b32 v6, s30, 0
+; GFX9-O3-NEXT:    v_writelane_b32 v8, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-O3-NEXT:    v_writelane_b32 v6, s31, 1
+; GFX9-O3-NEXT:    v_writelane_b32 v8, s31, 1
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
 ; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4
 ; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12
 ; GFX9-O3-NEXT:    s_load_dwordx2 s[36:37], s[36:37], 0x0
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s8
+; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[38:39], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v8
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[38:39]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v6, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v6, 0
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
@@ -922,64 +922,64 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s64, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s65, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s66, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s67, 3
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s64, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s65, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s66, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s67, 3
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
@@ -987,147 +987,147 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s14
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s21
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s23
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s22
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s24
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s23
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v47, s25
-; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s28
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s29
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s24
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s25
+; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s26
+; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s27
+; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s29
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v47
-; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v46
-; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v45
-; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v44
-; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v46
+; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v45
+; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v44
+; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v47
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v46
-; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v45
-; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v44
-; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr43 killed $exec
-; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v46
+; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v45
+; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v44
+; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v43
+; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr42 killed $exec
+; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -1157,55 +1157,55 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 ; GFX9-O0-NEXT:    s_mov_b64 s[34:35], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v11
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v35, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v36, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v35, v9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v37, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v38, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v36, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v37, v7
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v39, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v40, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v38, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v39, v5
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v41, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v42, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v40, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v41, v3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v34
-; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v33
+; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v32
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v36
-; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v35
+; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v34
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v38
-; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:20
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v37
+; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v36
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v40
-; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:28
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v39
+; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v38
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:24
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v42
-; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v41
+; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s5
@@ -1233,20 +1233,19 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, s27
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v24, s28
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v25, s29
-; GFX9-O0-NEXT:    v_readlane_b32 s67, v32, 3
-; GFX9-O0-NEXT:    v_readlane_b32 s66, v32, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s65, v32, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s64, v32, 0
-; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_readlane_b32 s67, v47, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s66, v47, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s65, v47, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s64, v47, 0
+; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
@@ -1256,7 +1255,8 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]


        


More information about the llvm-commits mailing list